DataCamp offers several interactive courses related to R Programming. While much of it is review, it is always helpful to see other perspectives on material. As well, DataCamp has some interesting materials on packages that I want to learn better (ggplot2, dplyr, ggvis, etc.). This document summarizes a few key insights from:
An additional document will be maintained for several of the more statistical areas of the Data Camp offering, as well as for the few courses offered in Python.
There are a few nuggets from within these beginning modules, including:
Below is some sample code showing examples for the generic statements:
# Factors
# Assignment changed from "=" to "<-" for consistency with the rest of this file.
xRaw <- c("High", "High", "Low", "Low", "Medium", "Very High", "Low")
# An unordered factor: levels are categories with no rank, so "<"/">" are undefined
xFactorNon <- factor(xRaw, levels=c("Low", "Medium", "High", "Very High"))
xFactorNon
## [1] High High Low Low Medium Very High Low
## Levels: Low Medium High Very High
xFactorNon[xFactorNon == "High"] > xFactorNon[xFactorNon == "Low"][1]
## Warning in Ops.factor(xFactorNon[xFactorNon == "High"],
## xFactorNon[xFactorNon == : '>' not meaningful for factors
## [1] NA NA
# ordered=TRUE makes the levels comparable (Low < Medium < High < Very High)
xFactorOrder <- factor(xRaw, ordered=TRUE, levels=c("Low", "Medium", "High", "Very High"))
xFactorOrder
## [1] High High Low Low Medium Very High Low
## Levels: Low < Medium < High < Very High
xFactorOrder[xFactorOrder == "High"] > xFactorOrder[xFactorOrder == "Low"][1]
## [1] TRUE TRUE
# Subsets
data(mtcars)
# subset() keeps the rows for which the condition evaluates to TRUE
subset(mtcars, mpg>=25)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
## Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
## Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
## Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
# subset() is equivalent to logical-index bracket extraction on the rows
identical(subset(mtcars, mpg>=25), mtcars[mtcars$mpg>=25, ])
## [1] TRUE
# The select= argument restricts which columns come back
subset(mtcars, mpg>25, select=c("mpg", "cyl", "disp"))
## mpg cyl disp
## Fiat 128 32.4 4 78.7
## Honda Civic 30.4 4 75.7
## Toyota Corolla 33.9 4 71.1
## Fiat X1-9 27.3 4 79.0
## Porsche 914-2 26.0 4 120.3
## Lotus Europa 30.4 4 95.1
# & and && (same as | and ||)
# & and | compare elementwise and return a vector; && and || examine only
# the FIRST elements and return a single value.
# NOTE(review): since R 4.3, && and || signal an error when given vectors of
# length > 1, so the last two results below reflect pre-4.3 behaviour.
compA <- c(2, 3, 4, 1, 2, 3)
compB <- c(1, 2, 3, 4, 5, 6)
(compA > compB) & (compA + compB < 6)
## [1] TRUE TRUE FALSE FALSE FALSE FALSE
(compA > compB) | (compA + compB < 6)
## [1] TRUE TRUE TRUE TRUE FALSE FALSE
(compA > compB) && (compA + compB < 6)
## [1] TRUE
(compA > compB) || (compA + compB < 6)
## [1] TRUE
# Loops and cat()
# General for-loop skeleton:
# for (a in b) {
# do stuff
# if (exitCond) { break }
# if (nextCond) { next }
# do some more stuff
# }
# Iterates over the elementwise product compA*compB = c(2, 6, 12, 4, 10, 18).
# next skips to the following iteration; break exits the loop entirely.
# cat() writes raw text (newlines must be explicit "\n"); print() adds the
# [1] prefix and quotes around character strings.
for (myVal in compA*compB) {
print(paste0("myVal is: ", myVal))
if ((myVal %% 3) == 0) { cat("Divisible by 3, not happy about that\n\n"); next }
print("That is not divisible by 3")
if ((myVal %% 5) == 0) { cat("Exiting due to divisible by 5 but not divisible by 3\n\n"); break }
cat("Onwards and upwards\n\n")
}
## [1] "myVal is: 2"
## [1] "That is not divisible by 3"
## Onwards and upwards
##
## [1] "myVal is: 6"
## Divisible by 3, not happy about that
##
## [1] "myVal is: 12"
## Divisible by 3, not happy about that
##
## [1] "myVal is: 4"
## [1] "That is not divisible by 3"
## Onwards and upwards
##
## [1] "myVal is: 10"
## [1] "That is not divisible by 3"
## Exiting due to divisible by 5 but not divisible by 3
# args() and search()
# args() shows a function's formal arguments and their default values
args(plot.default)
## function (x, y = NULL, type = "p", xlim = NULL, ylim = NULL,
## log = "", main = NULL, sub = NULL, xlab = NULL, ylab = NULL,
## ann = par("ann"), axes = TRUE, frame.plot = axes, panel.first = NULL,
## panel.last = NULL, asp = NA, ...)
## NULL
# search() lists the attached packages/environments, in name-lookup order
search()
## [1] ".GlobalEnv" "package:stats" "package:graphics"
## [4] "package:grDevices" "package:utils" "package:datasets"
## [7] "package:methods" "Autoloads" "package:base"
# unique()
compA
## [1] 2 3 4 1 2 3
# unique() drops duplicate values, keeping first-appearance order
unique(compA)
## [1] 2 3 4 1
# unlist()
listA <- as.list(compA)
# unlist() flattens a list back into an atomic vector
unlist(listA)
## [1] 2 3 4 1 2 3
identical(compA, unlist(listA))
## [1] TRUE
# sort()
# sort() returns the values in ascending order by default
sort(mtcars$mpg)
## [1] 10.4 10.4 13.3 14.3 14.7 15.0 15.2 15.2 15.5 15.8 16.4 17.3 17.8 18.1
## [15] 18.7 19.2 19.2 19.7 21.0 21.0 21.4 21.4 21.5 22.8 22.8 24.4 26.0 27.3
## [29] 30.4 30.4 32.4 33.9
# decreasing=TRUE flips the order
sort(mtcars$mpg, decreasing=TRUE)
## [1] 33.9 32.4 30.4 30.4 27.3 26.0 24.4 22.8 22.8 21.5 21.4 21.4 21.0 21.0
## [15] 19.7 19.2 19.2 18.7 18.1 17.8 17.3 16.4 15.8 15.5 15.2 15.2 15.0 14.7
## [29] 14.3 13.3 10.4 10.4
# rep()
# times= repeats the whole vector; each= repeats every element in place.
# When both are supplied, each= is applied first, then times=.
# times= may also be a vector giving a per-element repeat count.
rep(1:6, times=2) # 1:6 followed by 1:6
## [1] 1 2 3 4 5 6 1 2 3 4 5 6
rep(1:6, each=2) # 1 1 2 2 3 3 4 4 5 5 6 6
## [1] 1 1 2 2 3 3 4 4 5 5 6 6
rep(1:6, times=2, each=3) # 1 1 1 2 2 2 3 3 3 4 4 4 5 5 5 6 6 6 repeated twice (each comes first)
## [1] 1 1 1 2 2 2 3 3 3 4 4 4 5 5 5 6 6 6 1 1 1 2 2 2 3 3 3 4 4 4 5 5 5 6 6
## [36] 6
rep(1:6, times=6:1) # 1 1 1 1 1 1 2 2 2 2 2 3 3 3 3 4 4 4 5 5 6
## [1] 1 1 1 1 1 1 2 2 2 2 2 3 3 3 3 4 4 4 5 5 6
# append()
myWords <- c("The", "cat", "in", "the", "hat")
# By default append() adds at the end; paste(collapse=" ") joins into one string
paste(append(myWords, c("is", "fun", "to", "read")), collapse=" ")
## [1] "The cat in the hat is fun to read"
# The third argument (after=) inserts at a position instead of appending
paste(append(myWords, "funny", 4), collapse=" ")
## [1] "The cat in the funny hat"
# gsub() with backreferences ("\\1", "\\2", "\\3")
sampMsg <- "This is from myname@subdomain.mydomain.com again"
# Three capture groups: everything through "@", the domain, and the remainder;
# the replacement backreference picks which group survives
gsub("(^.*\\w*[a-zA-Z0-9]+@)([a-zA-Z0-9]+\\.[a-zA-Z0-9.]+)(.*$)", "\\1", sampMsg)
## [1] "This is from myname@"
gsub("(^.*\\w*[a-zA-Z0-9]+@)([a-zA-Z0-9]+\\.[a-zA-Z0-9.]+)(.*$)", "\\2", sampMsg)
## [1] "subdomain.mydomain.com"
gsub("(^.*\\w*[a-zA-Z0-9]+@)([a-zA-Z0-9]+\\.[a-zA-Z0-9.]+)(.*$)", "\\3", sampMsg)
## [1] " again"
# rev()
compA
## [1] 2 3 4 1 2 3
# rev() reverses the element order
rev(compA)
## [1] 3 2 1 4 3 2
Below is some sample code showing examples for the apply statements:
# lapply
# lapply() always returns a list, one element per input element
args(lapply)
## function (X, FUN, ...)
## NULL
lapply(1:5, FUN=sqrt)
## [[1]]
## [1] 1
##
## [[2]]
## [1] 1.414214
##
## [[3]]
## [1] 1.732051
##
## [[4]]
## [1] 2
##
## [[5]]
## [1] 2.236068
# Extra named arguments (here y=3) are passed through ... to FUN,
# overriding FUN's own default of y=2
lapply(1:5, FUN=function(x, y=2) { c(x=x, y=y, pow=x^y) }, y=3)
## [[1]]
## x y pow
## 1 3 1
##
## [[2]]
## x y pow
## 2 3 8
##
## [[3]]
## x y pow
## 3 3 27
##
## [[4]]
## x y pow
## 4 3 64
##
## [[5]]
## x y pow
## 5 3 125
# Elements of the result list need not all have the same shape
lapply(1:5, FUN=function(x, y=2) { if (x <= 3) {c(x=x, y=y, pow=x^y) } else { c(pow=x^y) } }, y=3)
## [[1]]
## x y pow
## 1 3 1
##
## [[2]]
## x y pow
## 2 3 8
##
## [[3]]
## x y pow
## 3 3 27
##
## [[4]]
## pow
## 64
##
## [[5]]
## pow
## 125
# sapply (defaults to returning a named vector/array if possible; is lapply otherwise)
args(sapply)
## function (X, FUN, ..., simplify = TRUE, USE.NAMES = TRUE)
## NULL
# sapply() uses simplify2array() internally when simplify=TRUE
args(simplify2array)
## function (x, higher = TRUE)
## NULL
# Equal-length results simplify to a vector...
sapply(1:5, FUN=sqrt)
## [1] 1.000000 1.414214 1.732051 2.000000 2.236068
# ...or to a matrix (one column per input element)
sapply(1:5, FUN=function(x, y=2) { c(x=x, y=y, pow=x^y) }, y=3)
## [,1] [,2] [,3] [,4] [,5]
## x 1 2 3 4 5
## pow 1 8 27 64 125
## y 3 3 3 3 3
# Ragged results cannot be simplified, so sapply() silently falls back
# to returning a list — a gotcha in non-interactive code
sapply(1:5, FUN=function(x, y=2) { if (x <= 3) {c(x=x, y=y, pow=x^y) } else { c(pow=x^y) } }, y=3)
## [[1]]
## x y pow
## 1 3 1
##
## [[2]]
## x y pow
## 2 3 8
##
## [[3]]
## x y pow
## 3 3 27
##
## [[4]]
## pow
## 64
##
## [[5]]
## pow
## 125
# vapply (tells sapply exactly what should be returned; errors out otherwise)
args(vapply)
## function (X, FUN, FUN.VALUE, ..., USE.NAMES = TRUE)
## NULL
# FUN.VALUE is a template declaring the type and length each result must have
vapply(1:5, FUN=sqrt, FUN.VALUE=numeric(1))
## [1] 1.000000 1.414214 1.732051 2.000000 2.236068
vapply(1:5, FUN=function(x, y=2) { c(x=x, y=y, pow=x^y) }, FUN.VALUE=numeric(3), y=3)
## [,1] [,2] [,3] [,4] [,5]
## x 1 2 3 4 5
## y 3 3 3 3 3
## pow 1 8 27 64 125
Below is some sample code for handling dates and times in R:
Sys.Date()
## [1] "2017-02-14"
Sys.time()
## [1] "2017-02-14 08:16:39 CST"
args(strptime)
## function (x, format, tz = "")
## NULL
rightNow <- as.POSIXct(Sys.time())
# NOTE(review): %M is MINUTES while %m is month -- the "16" after ** in the
# output below is the minute, a common format-string gotcha
format(rightNow, "%Y**%M-%d %H hours and %M minutes", usetz=TRUE)
## [1] "2017**16-14 08 hours and 16 minutes CST"
# %X parses the time portion as "%H:%M:%S" on input
lastChristmasNoon <- as.POSIXct("2015-12-25 12:00:00", format="%Y-%m-%d %X")
rightNow - lastChristmasNoon
## Time difference of 416.8449 days
# %y is the two-digit year; %b matches an abbreviated (locale-specific) month name
nextUMHomeGame <- as.POSIXct("16/SEP/3 12:00:00", format="%y/%b/%d %H:%M:%S", tz="America/Detroit")
nextUMHomeGame - rightNow
## Time difference of -163.9282 days
# Time zones available in R
OlsonNames()
## [1] "Africa/Abidjan" "Africa/Accra"
## [3] "Africa/Addis_Ababa" "Africa/Algiers"
## [5] "Africa/Asmara" "Africa/Asmera"
## [7] "Africa/Bamako" "Africa/Bangui"
## [9] "Africa/Banjul" "Africa/Bissau"
## [11] "Africa/Blantyre" "Africa/Brazzaville"
## [13] "Africa/Bujumbura" "Africa/Cairo"
## [15] "Africa/Casablanca" "Africa/Ceuta"
## [17] "Africa/Conakry" "Africa/Dakar"
## [19] "Africa/Dar_es_Salaam" "Africa/Djibouti"
## [21] "Africa/Douala" "Africa/El_Aaiun"
## [23] "Africa/Freetown" "Africa/Gaborone"
## [25] "Africa/Harare" "Africa/Johannesburg"
## [27] "Africa/Juba" "Africa/Kampala"
## [29] "Africa/Khartoum" "Africa/Kigali"
## [31] "Africa/Kinshasa" "Africa/Lagos"
## [33] "Africa/Libreville" "Africa/Lome"
## [35] "Africa/Luanda" "Africa/Lubumbashi"
## [37] "Africa/Lusaka" "Africa/Malabo"
## [39] "Africa/Maputo" "Africa/Maseru"
## [41] "Africa/Mbabane" "Africa/Mogadishu"
## [43] "Africa/Monrovia" "Africa/Nairobi"
## [45] "Africa/Ndjamena" "Africa/Niamey"
## [47] "Africa/Nouakchott" "Africa/Ouagadougou"
## [49] "Africa/Porto-Novo" "Africa/Sao_Tome"
## [51] "Africa/Timbuktu" "Africa/Tripoli"
## [53] "Africa/Tunis" "Africa/Windhoek"
## [55] "America/Adak" "America/Anchorage"
## [57] "America/Anguilla" "America/Antigua"
## [59] "America/Araguaina" "America/Argentina/Buenos_Aires"
## [61] "America/Argentina/Catamarca" "America/Argentina/ComodRivadavia"
## [63] "America/Argentina/Cordoba" "America/Argentina/Jujuy"
## [65] "America/Argentina/La_Rioja" "America/Argentina/Mendoza"
## [67] "America/Argentina/Rio_Gallegos" "America/Argentina/Salta"
## [69] "America/Argentina/San_Juan" "America/Argentina/San_Luis"
## [71] "America/Argentina/Tucuman" "America/Argentina/Ushuaia"
## [73] "America/Aruba" "America/Asuncion"
## [75] "America/Atikokan" "America/Atka"
## [77] "America/Bahia" "America/Bahia_Banderas"
## [79] "America/Barbados" "America/Belem"
## [81] "America/Belize" "America/Blanc-Sablon"
## [83] "America/Boa_Vista" "America/Bogota"
## [85] "America/Boise" "America/Buenos_Aires"
## [87] "America/Cambridge_Bay" "America/Campo_Grande"
## [89] "America/Cancun" "America/Caracas"
## [91] "America/Catamarca" "America/Cayenne"
## [93] "America/Cayman" "America/Chicago"
## [95] "America/Chihuahua" "America/Coral_Harbour"
## [97] "America/Cordoba" "America/Costa_Rica"
## [99] "America/Creston" "America/Cuiaba"
## [101] "America/Curacao" "America/Danmarkshavn"
## [103] "America/Dawson" "America/Dawson_Creek"
## [105] "America/Denver" "America/Detroit"
## [107] "America/Dominica" "America/Edmonton"
## [109] "America/Eirunepe" "America/El_Salvador"
## [111] "America/Ensenada" "America/Fort_Nelson"
## [113] "America/Fort_Wayne" "America/Fortaleza"
## [115] "America/Glace_Bay" "America/Godthab"
## [117] "America/Goose_Bay" "America/Grand_Turk"
## [119] "America/Grenada" "America/Guadeloupe"
## [121] "America/Guatemala" "America/Guayaquil"
## [123] "America/Guyana" "America/Halifax"
## [125] "America/Havana" "America/Hermosillo"
## [127] "America/Indiana/Indianapolis" "America/Indiana/Knox"
## [129] "America/Indiana/Marengo" "America/Indiana/Petersburg"
## [131] "America/Indiana/Tell_City" "America/Indiana/Vevay"
## [133] "America/Indiana/Vincennes" "America/Indiana/Winamac"
## [135] "America/Indianapolis" "America/Inuvik"
## [137] "America/Iqaluit" "America/Jamaica"
## [139] "America/Jujuy" "America/Juneau"
## [141] "America/Kentucky/Louisville" "America/Kentucky/Monticello"
## [143] "America/Knox_IN" "America/Kralendijk"
## [145] "America/La_Paz" "America/Lima"
## [147] "America/Los_Angeles" "America/Louisville"
## [149] "America/Lower_Princes" "America/Maceio"
## [151] "America/Managua" "America/Manaus"
## [153] "America/Marigot" "America/Martinique"
## [155] "America/Matamoros" "America/Mazatlan"
## [157] "America/Mendoza" "America/Menominee"
## [159] "America/Merida" "America/Metlakatla"
## [161] "America/Mexico_City" "America/Miquelon"
## [163] "America/Moncton" "America/Monterrey"
## [165] "America/Montevideo" "America/Montreal"
## [167] "America/Montserrat" "America/Nassau"
## [169] "America/New_York" "America/Nipigon"
## [171] "America/Nome" "America/Noronha"
## [173] "America/North_Dakota/Beulah" "America/North_Dakota/Center"
## [175] "America/North_Dakota/New_Salem" "America/Ojinaga"
## [177] "America/Panama" "America/Pangnirtung"
## [179] "America/Paramaribo" "America/Phoenix"
## [181] "America/Port-au-Prince" "America/Port_of_Spain"
## [183] "America/Porto_Acre" "America/Porto_Velho"
## [185] "America/Puerto_Rico" "America/Rainy_River"
## [187] "America/Rankin_Inlet" "America/Recife"
## [189] "America/Regina" "America/Resolute"
## [191] "America/Rio_Branco" "America/Rosario"
## [193] "America/Santa_Isabel" "America/Santarem"
## [195] "America/Santiago" "America/Santo_Domingo"
## [197] "America/Sao_Paulo" "America/Scoresbysund"
## [199] "America/Shiprock" "America/Sitka"
## [201] "America/St_Barthelemy" "America/St_Johns"
## [203] "America/St_Kitts" "America/St_Lucia"
## [205] "America/St_Thomas" "America/St_Vincent"
## [207] "America/Swift_Current" "America/Tegucigalpa"
## [209] "America/Thule" "America/Thunder_Bay"
## [211] "America/Tijuana" "America/Toronto"
## [213] "America/Tortola" "America/Vancouver"
## [215] "America/Virgin" "America/Whitehorse"
## [217] "America/Winnipeg" "America/Yakutat"
## [219] "America/Yellowknife" "Antarctica/Casey"
## [221] "Antarctica/Davis" "Antarctica/DumontDUrville"
## [223] "Antarctica/Macquarie" "Antarctica/Mawson"
## [225] "Antarctica/McMurdo" "Antarctica/Palmer"
## [227] "Antarctica/Rothera" "Antarctica/South_Pole"
## [229] "Antarctica/Syowa" "Antarctica/Troll"
## [231] "Antarctica/Vostok" "Arctic/Longyearbyen"
## [233] "Asia/Aden" "Asia/Almaty"
## [235] "Asia/Amman" "Asia/Anadyr"
## [237] "Asia/Aqtau" "Asia/Aqtobe"
## [239] "Asia/Ashgabat" "Asia/Ashkhabad"
## [241] "Asia/Baghdad" "Asia/Bahrain"
## [243] "Asia/Baku" "Asia/Bangkok"
## [245] "Asia/Beirut" "Asia/Bishkek"
## [247] "Asia/Brunei" "Asia/Calcutta"
## [249] "Asia/Chita" "Asia/Choibalsan"
## [251] "Asia/Chongqing" "Asia/Chungking"
## [253] "Asia/Colombo" "Asia/Dacca"
## [255] "Asia/Damascus" "Asia/Dhaka"
## [257] "Asia/Dili" "Asia/Dubai"
## [259] "Asia/Dushanbe" "Asia/Gaza"
## [261] "Asia/Harbin" "Asia/Hebron"
## [263] "Asia/Ho_Chi_Minh" "Asia/Hong_Kong"
## [265] "Asia/Hovd" "Asia/Irkutsk"
## [267] "Asia/Istanbul" "Asia/Jakarta"
## [269] "Asia/Jayapura" "Asia/Jerusalem"
## [271] "Asia/Kabul" "Asia/Kamchatka"
## [273] "Asia/Karachi" "Asia/Kashgar"
## [275] "Asia/Kathmandu" "Asia/Katmandu"
## [277] "Asia/Khandyga" "Asia/Kolkata"
## [279] "Asia/Krasnoyarsk" "Asia/Kuala_Lumpur"
## [281] "Asia/Kuching" "Asia/Kuwait"
## [283] "Asia/Macao" "Asia/Macau"
## [285] "Asia/Magadan" "Asia/Makassar"
## [287] "Asia/Manila" "Asia/Muscat"
## [289] "Asia/Nicosia" "Asia/Novokuznetsk"
## [291] "Asia/Novosibirsk" "Asia/Omsk"
## [293] "Asia/Oral" "Asia/Phnom_Penh"
## [295] "Asia/Pontianak" "Asia/Pyongyang"
## [297] "Asia/Qatar" "Asia/Qyzylorda"
## [299] "Asia/Rangoon" "Asia/Riyadh"
## [301] "Asia/Saigon" "Asia/Sakhalin"
## [303] "Asia/Samarkand" "Asia/Seoul"
## [305] "Asia/Shanghai" "Asia/Singapore"
## [307] "Asia/Srednekolymsk" "Asia/Taipei"
## [309] "Asia/Tashkent" "Asia/Tbilisi"
## [311] "Asia/Tehran" "Asia/Tel_Aviv"
## [313] "Asia/Thimbu" "Asia/Thimphu"
## [315] "Asia/Tokyo" "Asia/Ujung_Pandang"
## [317] "Asia/Ulaanbaatar" "Asia/Ulan_Bator"
## [319] "Asia/Urumqi" "Asia/Ust-Nera"
## [321] "Asia/Vientiane" "Asia/Vladivostok"
## [323] "Asia/Yakutsk" "Asia/Yekaterinburg"
## [325] "Asia/Yerevan" "Atlantic/Azores"
## [327] "Atlantic/Bermuda" "Atlantic/Canary"
## [329] "Atlantic/Cape_Verde" "Atlantic/Faeroe"
## [331] "Atlantic/Faroe" "Atlantic/Jan_Mayen"
## [333] "Atlantic/Madeira" "Atlantic/Reykjavik"
## [335] "Atlantic/South_Georgia" "Atlantic/St_Helena"
## [337] "Atlantic/Stanley" "Australia/ACT"
## [339] "Australia/Adelaide" "Australia/Brisbane"
## [341] "Australia/Broken_Hill" "Australia/Canberra"
## [343] "Australia/Currie" "Australia/Darwin"
## [345] "Australia/Eucla" "Australia/Hobart"
## [347] "Australia/LHI" "Australia/Lindeman"
## [349] "Australia/Lord_Howe" "Australia/Melbourne"
## [351] "Australia/North" "Australia/NSW"
## [353] "Australia/Perth" "Australia/Queensland"
## [355] "Australia/South" "Australia/Sydney"
## [357] "Australia/Tasmania" "Australia/Victoria"
## [359] "Australia/West" "Australia/Yancowinna"
## [361] "Brazil/Acre" "Brazil/DeNoronha"
## [363] "Brazil/East" "Brazil/West"
## [365] "Canada/Atlantic" "Canada/Central"
## [367] "Canada/East-Saskatchewan" "Canada/Eastern"
## [369] "Canada/Mountain" "Canada/Newfoundland"
## [371] "Canada/Pacific" "Canada/Saskatchewan"
## [373] "Canada/Yukon" "CET"
## [375] "Chile/Continental" "Chile/EasterIsland"
## [377] "CST6CDT" "Cuba"
## [379] "EET" "Egypt"
## [381] "Eire" "EST"
## [383] "EST5EDT" "Etc/GMT"
## [385] "Etc/GMT-0" "Etc/GMT-1"
## [387] "Etc/GMT-10" "Etc/GMT-11"
## [389] "Etc/GMT-12" "Etc/GMT-13"
## [391] "Etc/GMT-14" "Etc/GMT-2"
## [393] "Etc/GMT-3" "Etc/GMT-4"
## [395] "Etc/GMT-5" "Etc/GMT-6"
## [397] "Etc/GMT-7" "Etc/GMT-8"
## [399] "Etc/GMT-9" "Etc/GMT+0"
## [401] "Etc/GMT+1" "Etc/GMT+10"
## [403] "Etc/GMT+11" "Etc/GMT+12"
## [405] "Etc/GMT+2" "Etc/GMT+3"
## [407] "Etc/GMT+4" "Etc/GMT+5"
## [409] "Etc/GMT+6" "Etc/GMT+7"
## [411] "Etc/GMT+8" "Etc/GMT+9"
## [413] "Etc/GMT0" "Etc/Greenwich"
## [415] "Etc/UCT" "Etc/Universal"
## [417] "Etc/UTC" "Etc/Zulu"
## [419] "Europe/Amsterdam" "Europe/Andorra"
## [421] "Europe/Athens" "Europe/Belfast"
## [423] "Europe/Belgrade" "Europe/Berlin"
## [425] "Europe/Bratislava" "Europe/Brussels"
## [427] "Europe/Bucharest" "Europe/Budapest"
## [429] "Europe/Busingen" "Europe/Chisinau"
## [431] "Europe/Copenhagen" "Europe/Dublin"
## [433] "Europe/Gibraltar" "Europe/Guernsey"
## [435] "Europe/Helsinki" "Europe/Isle_of_Man"
## [437] "Europe/Istanbul" "Europe/Jersey"
## [439] "Europe/Kaliningrad" "Europe/Kiev"
## [441] "Europe/Lisbon" "Europe/Ljubljana"
## [443] "Europe/London" "Europe/Luxembourg"
## [445] "Europe/Madrid" "Europe/Malta"
## [447] "Europe/Mariehamn" "Europe/Minsk"
## [449] "Europe/Monaco" "Europe/Moscow"
## [451] "Europe/Nicosia" "Europe/Oslo"
## [453] "Europe/Paris" "Europe/Podgorica"
## [455] "Europe/Prague" "Europe/Riga"
## [457] "Europe/Rome" "Europe/Samara"
## [459] "Europe/San_Marino" "Europe/Sarajevo"
## [461] "Europe/Simferopol" "Europe/Skopje"
## [463] "Europe/Sofia" "Europe/Stockholm"
## [465] "Europe/Tallinn" "Europe/Tirane"
## [467] "Europe/Tiraspol" "Europe/Uzhgorod"
## [469] "Europe/Vaduz" "Europe/Vatican"
## [471] "Europe/Vienna" "Europe/Vilnius"
## [473] "Europe/Volgograd" "Europe/Warsaw"
## [475] "Europe/Zagreb" "Europe/Zaporozhye"
## [477] "Europe/Zurich" "GB"
## [479] "GB-Eire" "GMT"
## [481] "GMT-0" "GMT+0"
## [483] "GMT0" "Greenwich"
## [485] "Hongkong" "HST"
## [487] "Iceland" "Indian/Antananarivo"
## [489] "Indian/Chagos" "Indian/Christmas"
## [491] "Indian/Cocos" "Indian/Comoro"
## [493] "Indian/Kerguelen" "Indian/Mahe"
## [495] "Indian/Maldives" "Indian/Mauritius"
## [497] "Indian/Mayotte" "Indian/Reunion"
## [499] "Iran" "Israel"
## [501] "Jamaica" "Japan"
## [503] "Kwajalein" "Libya"
## [505] "MET" "Mexico/BajaNorte"
## [507] "Mexico/BajaSur" "Mexico/General"
## [509] "MST" "MST7MDT"
## [511] "Navajo" "NZ"
## [513] "NZ-CHAT" "Pacific/Apia"
## [515] "Pacific/Auckland" "Pacific/Bougainville"
## [517] "Pacific/Chatham" "Pacific/Chuuk"
## [519] "Pacific/Easter" "Pacific/Efate"
## [521] "Pacific/Enderbury" "Pacific/Fakaofo"
## [523] "Pacific/Fiji" "Pacific/Funafuti"
## [525] "Pacific/Galapagos" "Pacific/Gambier"
## [527] "Pacific/Guadalcanal" "Pacific/Guam"
## [529] "Pacific/Honolulu" "Pacific/Johnston"
## [531] "Pacific/Kiritimati" "Pacific/Kosrae"
## [533] "Pacific/Kwajalein" "Pacific/Majuro"
## [535] "Pacific/Marquesas" "Pacific/Midway"
## [537] "Pacific/Nauru" "Pacific/Niue"
## [539] "Pacific/Norfolk" "Pacific/Noumea"
## [541] "Pacific/Pago_Pago" "Pacific/Palau"
## [543] "Pacific/Pitcairn" "Pacific/Pohnpei"
## [545] "Pacific/Ponape" "Pacific/Port_Moresby"
## [547] "Pacific/Rarotonga" "Pacific/Saipan"
## [549] "Pacific/Samoa" "Pacific/Tahiti"
## [551] "Pacific/Tarawa" "Pacific/Tongatapu"
## [553] "Pacific/Truk" "Pacific/Wake"
## [555] "Pacific/Wallis" "Pacific/Yap"
## [557] "Poland" "Portugal"
## [559] "PRC" "PST8PDT"
## [561] "ROC" "ROK"
## [563] "Singapore" "Turkey"
## [565] "UCT" "Universal"
## [567] "US/Alaska" "US/Aleutian"
## [569] "US/Arizona" "US/Central"
## [571] "US/East-Indiana" "US/Eastern"
## [573] "US/Hawaii" "US/Indiana-Starke"
## [575] "US/Michigan" "US/Mountain"
## [577] "US/Pacific" "US/Pacific-New"
## [579] "US/Samoa" "UTC"
## [581] "VERSION" "W-SU"
## [583] "WET" "Zulu"
# From ?strptime (excerpted)
#
# ** General formats **
# %c Date and time. Locale-specific on output, "%a %b %e %H:%M:%S %Y" on input.
# %F Equivalent to %Y-%m-%d (the ISO 8601 date format).
# %T Equivalent to %H:%M:%S.
# %D Date format such as %m/%d/%y: the C99 standard says it should be that exact format
# %x Date. Locale-specific on output, "%y/%m/%d" on input.
# %X Time. Locale-specific on output, "%H:%M:%S" on input.
#
# ** Key Components **
# %y Year without century (00-99). On input, values 00 to 68 are prefixed by 20 and 69 to 99 by 19
# %Y Year with century
# %m Month as decimal number (01-12).
# %b Abbreviated month name in the current locale on this platform.
# %B Full month name in the current locale.
# %d Day of the month as decimal number (01-31).
# %e Day of the month as decimal number (1-31), with a leading space for a single-digit number.
# %a Abbreviated weekday name in the current locale on this platform.
# %A Full weekday name in the current locale.
# %H Hours as decimal number (00-23)
# %I Hours as decimal number (01-12)
# %M Minute as decimal number (00-59).
# %S Second as integer (00-61), allowing for up to two leap-seconds (but POSIX-compliant implementations will ignore leap seconds).
#
# ** Additional Options **
# %C Century (00-99): the integer part of the year divided by 100.
#
# %g The last two digits of the week-based year (see %V). (Accepted but ignored on input.)
# %G The week-based year (see %V) as a decimal number. (Accepted but ignored on input.)
#
# %h Equivalent to %b.
#
# %j Day of year as decimal number (001-366).
#
# %n Newline on output, arbitrary whitespace on input.
#
# %p AM/PM indicator in the locale. Used in conjunction with %I and not with %H. An empty string in some locales (and the behaviour is undefined if used for input in such a locale). Some platforms accept %P for output, which uses a lower-case version: others will output P.
#
# %r The 12-hour clock time (using the locale's AM or PM). Only defined in some locales.
#
# %R Equivalent to %H:%M.
#
# %t Tab on output, arbitrary whitespace on input.
#
# %u Weekday as a decimal number (1-7, Monday is 1).
#
# %U Week of the year as decimal number (00-53) using Sunday as the first day 1 of the week (and typically with the first Sunday of the year as day 1 of week 1). The US convention.
#
# %V Week of the year as decimal number (01-53) as defined in ISO 8601. If the week (starting on Monday) containing 1 January has four or more days in the new year, then it is considered week 1. Otherwise, it is the last week of the previous year, and the next week is week 1. (Accepted but ignored on input.)
#
# %w Weekday as decimal number (0-6, Sunday is 0).
#
# %W Week of the year as decimal number (00-53) using Monday as the first day of week (and typically with the first Monday of the year as day 1 of week 1). The UK convention.
#
# For input, only years 0:9999 are accepted.
#
# %z Signed offset in hours and minutes from UTC, so -0800 is 8 hours behind UTC. Values up to +1400 are accepted as from R 3.1.1: previous versions only accepted up to +1200. (Standard only for output.)
#
# %Z (Output only.) Time zone abbreviation as a character string (empty if not available). This may not be reliable when a time zone has changed abbreviations over the years.
Additionally, code from several practice examples is added:
set.seed(1608221310)
me <- 89
# Simulate 199 classmates' grades, rounded to whole numbers
other_199 <- round(rnorm(199, mean=75.45, sd=11.03), 0)
mean(other_199)
## [1] 75.17588
sd(other_199)
## [1] 11.37711
# Target means and SDs for four prior years
desMeans <- c(72.275, 76.24, 74.5, 77.695)
desSD <- c(12.31, 11.22, 12.5, 12.53)
prevData <- c(rnorm(200, mean=72.275, sd=12.31),
rnorm(200, mean=76.24, sd=11.22),
rnorm(200, mean=74.5, sd=12.5),
rnorm(200, mean=77.695, sd=12.53)
)
previous_4 <- matrix(data=prevData, ncol=4)
# apply() with MARGIN=2 works column by column
curMeans <- apply(previous_4, 2, FUN=mean)
curSD <- apply(previous_4, 2, FUN=sd)
# Rescale each row so columns hit the desired means/SDs; t() undoes the
# transpose that apply() over rows (MARGIN=1) introduces
previous_4 <- t(apply(previous_4, 1, FUN=function(x) { desMeans + (desSD / curSD) * (x - curMeans) } ))
apply(round(previous_4, 0), 2, FUN=mean)
## [1] 72.285 76.245 74.505 77.665
apply(round(previous_4, 0), 2, FUN=sd)
## [1] 12.35097 11.19202 12.49643 12.51744
previous_4 <- round(previous_4, 0)
# Merge me and other_199: my_class
my_class <- c(me, other_199)
# cbind() my_class and previous_4: last_5
last_5 <- cbind(my_class, previous_4)
# Name last_5 appropriately
nms <- paste0("year_", 1:5)
colnames(last_5) <- nms
# Build histogram of my_class
hist(my_class)
# Generate summary of last_5
summary(last_5)
## year_1 year_2 year_3 year_4
## Min. : 46.00 Min. : 43.00 Min. : 38.00 Min. : 42.00
## 1st Qu.: 68.00 1st Qu.: 63.75 1st Qu.: 69.00 1st Qu.: 65.75
## Median : 75.50 Median : 73.00 Median : 76.50 Median : 74.00
## Mean : 75.25 Mean : 72.28 Mean : 76.25 Mean : 74.50
## 3rd Qu.: 83.25 3rd Qu.: 81.00 3rd Qu.: 84.25 3rd Qu.: 82.25
## Max. :108.00 Max. :108.00 Max. :102.00 Max. :113.00
## year_5
## Min. : 38.00
## 1st Qu.: 71.00
## Median : 78.00
## Mean : 77.67
## 3rd Qu.: 86.00
## Max. :117.00
# Build boxplot of last_5
boxplot(last_5)
# How many grades in your class are higher than 75?
# sum() on a logical vector counts the TRUEs
sum(my_class > 75)
## [1] 100
# How many students in your class scored strictly higher than you?
sum(my_class > me)
## [1] 17
# What's the proportion of grades below or equal to 64 in the last 5 years?
# mean() on a logical vector gives the proportion of TRUEs
mean(last_5 <= 64)
## [1] 0.191
# Is your grade greater than 87 and smaller than or equal to 89?
me > 87 & me <= 89
## [1] TRUE
# Which grades in your class are below 60 or above 90?
my_class < 60 | my_class > 90
## [1] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [12] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
## [23] TRUE FALSE TRUE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE
## [34] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [45] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [56] FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [67] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE TRUE FALSE
## [78] FALSE TRUE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE
## [89] TRUE FALSE FALSE FALSE FALSE TRUE FALSE FALSE TRUE TRUE FALSE
## [100] FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [111] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [122] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE TRUE FALSE FALSE
## [133] FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [144] TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [155] FALSE TRUE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
## [166] FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
## [177] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [188] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
## [199] FALSE FALSE
# What's the proportion of grades in your class that is average?
mean(my_class >= 70 & my_class <= 85)
## [1] 0.525
# How many students in the last 5 years had a grade of 80 or 90?
# %in% tests membership in a set and never returns NA
sum(last_5 %in% c(80, 90))
## [1] 44
# Define n_smart
n_smart <- sum(my_class >= 80)
# Code the if-else construct
if (n_smart > 50) {
print("smart class")
} else {
print("rather average")
}
## [1] "smart class"
# Define prop_less
prop_less <- mean(my_class < me)
# Code the control construct
# Conditions are checked top to bottom; the first TRUE branch wins
if (prop_less > 0.9) {
print("you're among the best 10 percent")
} else if (prop_less > 0.8) {
print("you're among the best 20 percent")
} else {
print("need more analysis")
}
## [1] "you're among the best 20 percent"
# Embedded control structure: fix the error
if (mean(my_class) < 75) {
if (mean(my_class) > me) {
print("average year, but still smarter than me")
} else {
print("average year, but I'm not that bad")
}
} else {
if (mean(my_class) > me) {
print("smart year, even smarter than me")
} else {
print("smart year, but I am smarter")
}
}
## [1] "smart year, but I am smarter"
# Create top_grades
top_grades <- my_class[my_class >= 85]
# Create worst_grades
worst_grades <- my_class[my_class < 65]
# Write conditional statement
if (length(top_grades) > length(worst_grades)) { print("top grades prevail") }
## [1] "top grades prevail"
Hadley and Charlotte Wickham led a course on writing functions in R. Broadly, the course includes advice on when/how to use functions, as well as specific advice about commands available through library(purrr).
Key pieces of advice include:
John Chambers gave a few useful slogans about functions:
Each function has three components:
Only the LAST evaluated expression is returned. The use of return() is recommended only for early returns in a special case (for example, exiting early because an edge case makes further computation unnecessary).
Further, functions can be written anonymously on the command line, such as (function (x) {x + 1}) (1:5). A function should only depend on arguments passed to it, not variables from a parent environment. Every time the function is called, it receives a clean working environment. Once it finishes, its variables are no longer available unless they were returned (either by default as the last operation, or by way of return()):
# Components of a function
args(rnorm)
## function (n, mean = 0, sd = 1)
## NULL
# formals(): the argument list and its default values
formals(rnorm)
## $n
##
##
## $mean
## [1] 0
##
## $sd
## [1] 1
# body(): the code the function runs
body(rnorm)
## .Call(C_rnorm, n, mean, sd)
# environment(): where the function looks up names it does not define itself
environment(rnorm)
## <environment: namespace:stats>
# What is passed back
# Demonstrates the two ways a value leaves a function: an explicit early
# return() inside a guard, and the implicit return of the last expression.
funDummy <- function(x) {
  if (x <= 2) {
    print("That is too small")
    return(3)  # explicit early exit for the small-input case
  }
  ceiling(x)  # implicit return: the last evaluated expression
}
funDummy(1)
## [1] "That is too small"
## [1] 3
funDummy(5)
## [1] 5
# Anonymous functions
# An unnamed function can be defined and applied in a single expression
(function (x) {x + 1}) (1:5)
## [1] 2 3 4 5 6
The course includes some insightful discussion of vectors. As it happens, lists and data frames are just special collections of vectors in R. Each column of a data frame is a vector, while each element of a list is either 1) an embedded data frame (which is eventually a vector by way of columns), 2) an embedded list (which is eventually a vector by way of recursion), or 3) an actual vector.
The atomic vectors are of types logical, integer, character, and double; complex and raw are rarer types that are also available. Lists are just recursive vectors, which is to say that lists can contain other lists and can be heterogeneous. To explore vectors, you have:
Note that NULL is the absence of a vector and has length 0. NA is the absence of an element in the vector and has length 1. All math operations with NA return NA; for example NA == NA will return NA.
There are some good tips on extracting elements from a list:
# Data types
data(mtcars)
str(mtcars)
## 'data.frame': 32 obs. of 11 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp: num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear: num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
# A data frame is implemented as a list of equal-length column vectors
typeof(mtcars) # n.b. that this is technically a "list"
## [1] "list"
# length() of a data frame is therefore its number of columns
length(mtcars)
## [1] 11
# NULL and NA
# NULL is the absence of a vector (length 0); NA is a missing element (length 1)
length(NULL)
## [1] 0
typeof(NULL)
## [1] "NULL"
length(NA)
## [1] 1
typeof(NA)
## [1] "logical"
# Comparisons against a length-0 object yield a length-0 result
NULL == NULL
## logical(0)
NULL == NA
## logical(0)
# NA propagates through comparisons -- use is.na() to test for missingness
NA == NA
## [1] NA
is.null(NULL)
## [1] TRUE
is.null(NA)
## [1] FALSE
is.na(NULL)
## Warning in is.na(NULL): is.na() applied to non-(list or vector) of type
## 'NULL'
## logical(0)
is.na(NA)
## [1] TRUE
# Extraction
mtcars[["mpg"]][1:5]
## [1] 21.0 21.0 22.8 21.4 18.7
mtcars[[2]][1:5]
## [1] 6 6 4 6 8
mtcars$hp[1:5]
## [1] 110 110 93 110 175
# Relevant lengths
seq_along(mtcars)
## [1] 1 2 3 4 5 6 7 8 9 10 11
x <- data.frame()
seq_along(x)
## integer(0)
length(seq_along(x))
## [1] 0
foo <- function(x) { for (eachCol in seq_along(x)) { print(typeof(x[[eachCol]])) }}
foo(mtcars)
## [1] "double"
## [1] "double"
## [1] "double"
## [1] "double"
## [1] "double"
## [1] "double"
## [1] "double"
## [1] "double"
## [1] "double"
## [1] "double"
## [1] "double"
foo(x) # Note that this does nothing!
data(airquality)
str(airquality)
## 'data.frame': 153 obs. of 6 variables:
## $ Ozone : int 41 36 12 18 NA 28 23 19 8 NA ...
## $ Solar.R: int 190 118 149 313 NA NA 299 99 19 194 ...
## $ Wind : num 7.4 8 12.6 11.5 14.3 14.9 8.6 13.8 20.1 8.6 ...
## $ Temp : int 67 72 74 62 56 66 65 59 61 69 ...
## $ Month : int 5 5 5 5 5 5 5 5 5 5 ...
## $ Day : int 1 2 3 4 5 6 7 8 9 10 ...
foo(airquality)
## [1] "integer"
## [1] "integer"
## [1] "double"
## [1] "integer"
## [1] "integer"
## [1] "integer"
# Range command
mpgRange <- range(mtcars$mpg)
mpgRange
## [1] 10.4 33.9
mpgScale <- (mtcars$mpg - mpgRange[1]) / (mpgRange[2] - mpgRange[1])
summary(mpgScale)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.2138 0.3745 0.4124 0.5277 1.0000
The typical arguments in a function use a consistent, simple naming convention:
Data arguments should come before detail arguments, and detail arguments should be given reasonable default values. See for example rnorm(n, mean=0, sd=1). The number requested (n) must be specified, but defaults are available for the details (mean and standard deviation).
Functions can be passed as arguments to other functions, which is at the core of functional programming. For example:
do_math <- function(x, fun) { fun(x) }
do_math(1:10, fun=mean)
## [1] 5.5
do_math(1:10, fun=sd)
## [1] 3.02765
The library(purrr) takes advantage of this, and in a type-consistent manner. There are functions for:
The general arguments are .x (a list or an atomic vector) and .f which can be either a function, an anonymous function (formula with ~), or an extractor .x[[.f]]. For example:
library(purrr)
## Warning: package 'purrr' was built under R version 3.2.5
library(RColorBrewer) # Need to have in non-cached chunk for later
data(mtcars)
# Create output as a list
map(.x=mtcars, .f=sum)
## $mpg
## [1] 642.9
##
## $cyl
## [1] 198
##
## $disp
## [1] 7383.1
##
## $hp
## [1] 4694
##
## $drat
## [1] 115.09
##
## $wt
## [1] 102.952
##
## $qsec
## [1] 571.16
##
## $vs
## [1] 14
##
## $am
## [1] 13
##
## $gear
## [1] 118
##
## $carb
## [1] 90
# Create same output as a double
map_dbl(.x=mtcars, .f=sum)
## mpg cyl disp hp drat wt qsec vs
## 642.900 198.000 7383.100 4694.000 115.090 102.952 571.160 14.000
## am gear carb
## 13.000 118.000 90.000
# Create same output as integer
# map_int(.x=mtcars, .f=sum) . . . this would bomb since it is not actually an integer
map_int(.x=mtcars, .f=function(x) { as.integer(round(sum(x), 0)) } )
## mpg cyl disp hp drat wt qsec vs am gear carb
## 643 198 7383 4694 115 103 571 14 13 118 90
# Same thing but using an anonymous function with ~ and .
map_int(.x=mtcars, .f = ~ as.integer(round(sum(.), 0)) )
## mpg cyl disp hp drat wt qsec vs am gear carb
## 643 198 7383 4694 115 103 571 14 13 118 90
# Create a boolean vector
map_lgl(.x=mtcars, .f = ~ ifelse(sum(.) > 200, TRUE, FALSE) )
## mpg cyl disp hp drat wt qsec vs am gear carb
## TRUE FALSE TRUE TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
# Create a character vector
map_chr(.x=mtcars, .f = ~ ifelse(sum(.) > 200, "Large", "Not So Large") )
## mpg cyl disp hp drat
## "Large" "Not So Large" "Large" "Large" "Not So Large"
## wt qsec vs am gear
## "Not So Large" "Large" "Not So Large" "Not So Large" "Not So Large"
## carb
## "Not So Large"
# Use the extractor [pulls the first row]
map_dbl(.x=mtcars, .f=1)
## mpg cyl disp hp drat wt qsec vs am gear
## 21.00 6.00 160.00 110.00 3.90 2.62 16.46 0.00 1.00 4.00
## carb
## 4.00
# Example from help file using chaining
mtcars %>%
split(.$cyl) %>%
map(~ lm(mpg ~ wt, data = .x)) %>%
map(summary) %>%
map_dbl("r.squared")
## 4 6 8
## 0.5086326 0.4645102 0.4229655
# Using sapply
sapply(split(mtcars, mtcars$cyl), FUN=function(.x) { summary(lm(mpg ~ wt, data=.x))$r.squared } )
## 4 6 8
## 0.5086326 0.4645102 0.4229655
# Use the extractor from a list
cylSplit <- split(mtcars, mtcars$cyl)
map(cylSplit, "mpg")
## $`4`
## [1] 22.8 24.4 22.8 32.4 30.4 33.9 21.5 27.3 26.0 30.4 21.4
##
## $`6`
## [1] 21.0 21.0 21.4 18.1 19.2 17.8 19.7
##
## $`8`
## [1] 18.7 14.3 16.4 17.3 15.2 10.4 10.4 14.7 15.5 15.2 13.3 19.2 15.8 15.0
map(cylSplit, "cyl")
## $`4`
## [1] 4 4 4 4 4 4 4 4 4 4 4
##
## $`6`
## [1] 6 6 6 6 6 6 6
##
## $`8`
## [1] 8 8 8 8 8 8 8 8 8 8 8 8 8 8
The purrr library has several additional interesting functions:
Some example code includes:
library(purrr) # Called again for clarity; all these key functions belong to purrr
# safely(.f, otherwise = NULL, quiet = TRUE)
safe_log10 <- safely(log10)
map(list(0, 1, 10, "a"), .f=safe_log10)
## [[1]]
## [[1]]$result
## [1] -Inf
##
## [[1]]$error
## NULL
##
##
## [[2]]
## [[2]]$result
## [1] 0
##
## [[2]]$error
## NULL
##
##
## [[3]]
## [[3]]$result
## [1] 1
##
## [[3]]$error
## NULL
##
##
## [[4]]
## [[4]]$result
## NULL
##
## [[4]]$error
## <simpleError in .f(...): non-numeric argument to mathematical function>
# possibly(.f, otherwise, quiet = TRUE)
poss_log10 <- possibly(log10, otherwise=NaN)
map_dbl(list(0, 1, 10, "a"), .f=poss_log10)
## [1] -Inf 0 1 NaN
# transpose() - note that this can become masked by data.table::transpose() so be careful
purrr::transpose(map(list(0, 1, 10, "a"), .f=safe_log10))
## $result
## $result[[1]]
## [1] -Inf
##
## $result[[2]]
## [1] 0
##
## $result[[3]]
## [1] 1
##
## $result[[4]]
## NULL
##
##
## $error
## $error[[1]]
## NULL
##
## $error[[2]]
## NULL
##
## $error[[3]]
## NULL
##
## $error[[4]]
## <simpleError in .f(...): non-numeric argument to mathematical function>
purrr::transpose(map(list(0, 1, 10, "a"), .f=safe_log10))$result
## [[1]]
## [1] -Inf
##
## [[2]]
## [1] 0
##
## [[3]]
## [1] 1
##
## [[4]]
## NULL
unlist(purrr::transpose(map(list(0, 1, 10, "a"), .f=safe_log10))$result)
## [1] -Inf 0 1
purrr::transpose(map(list(0, 1, 10, "a"), .f=safe_log10))$error
## [[1]]
## NULL
##
## [[2]]
## NULL
##
## [[3]]
## NULL
##
## [[4]]
## <simpleError in .f(...): non-numeric argument to mathematical function>
map_lgl(purrr::transpose(map(list(0, 1, 10, "a"), .f=safe_log10))$error, is.null)
## [1] TRUE TRUE TRUE FALSE
# map2(.x, .y, .f)
map2(list(5, 10, 20), list(1, 2, 3), .f=rnorm) # rnorm(5, 1), rnorm(10, 2), and rnorm(20, 3)
## [[1]]
## [1] 0.41176421 2.00652288 0.06152025 0.46963873 1.15436157
##
## [[2]]
## [1] 0.006821057 2.902712636 1.436150816 1.377836302 2.625075832
## [6] 0.680797806 0.313499192 0.718062969 2.820989906 3.134207742
##
## [[3]]
## [1] 3.3716474 2.9393673 1.8648940 3.2343343 2.1849894 2.0697179 1.0872014
## [8] 3.4970403 3.5769694 3.0999340 1.2033341 0.9839011 2.9820314 1.7116383
## [15] 0.8779558 1.6990118 2.5914013 2.3587803 3.7460957 1.2980312
# pmap(.l, .f)
pmap(list(n=list(5, 10, 20), mean=list(1, 5, 10), sd=list(0.1, 0.5, 0.1)), rnorm)
## [[1]]
## [1] 1.0151570 1.1573287 1.0628581 0.8805484 0.9418430
##
## [[2]]
## [1] 5.032920 4.689799 5.423525 5.265610 4.727383 5.252325 5.166292
## [8] 4.861745 5.135408 4.106679
##
## [[3]]
## [1] 9.854138 10.090939 10.045554 9.970755 10.092487 9.769531 10.140064
## [8] 9.834716 10.196817 10.047367 10.054093 10.006439 10.142002 10.092259
## [15] 10.222459 10.082440 10.067818 9.993884 10.078380 9.936942
# invoke_map(.f, .x, ...)
invoke_map(list(rnorm, runif, rexp), n=5)
## [[1]]
## [1] -0.96707137 0.08207476 1.39498168 0.60287972 -0.15130461
##
## [[2]]
## [1] 0.01087442 0.02980483 0.81443586 0.88438198 0.67976034
##
## [[3]]
## [1] 0.2646751 1.3233260 1.1079261 1.3504952 0.6795524
# walk() is for the side effects of a function
x <- list(1, "\n\ta\n", 3)
x %>% walk(cat)
## 1
## a
## 3
# Chaining is available by way of the %>% operator
pretty_titles <- c("N(0, 1)", "Uniform(0, 1)", "Exponential (rate=1)")
set.seed(1607120947)
x <- invoke_map(list(rnorm, runif, rexp), n=5000)
foo <- function(x) { map(x, .f=summary) }
par(mfrow=c(1, 3))
pwalk(list(x=x, main=pretty_titles), .f=hist, xlab="", col="light blue") %>% map(.f=foo)
## $x
## $x[[1]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.711000 -0.637800 -0.000217 0.006543 0.671800 3.633000
##
## $x[[2]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0001241 0.2518000 0.5012000 0.5028000 0.7566000 0.9999000
##
## $x[[3]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00001 0.29140 0.68340 0.98260 1.37900 8.46300
##
##
## $main
## $main[[1]]
## Length Class Mode
## 1 character character
##
## $main[[2]]
## Length Class Mode
## 1 character character
##
## $main[[3]]
## Length Class Mode
## 1 character character
par(mfrow=c(1, 1))
There are two potentially desirable behaviors with functions:
As a best practice, R functions that will be used for programming (as opposed to interactive command line work) should be written in a robust manner. Three standard problems should be avoided/mitigated:
There are several methods available for throwing errors within an R function:
One example that commonly creates surprises is the [,] operator for extraction. Adding [ , , drop=FALSE] ensures that you will still have what you passed (e.g., a matrix or data frame) rather than conversion of a chunk of data to a vector.
Another common source of error is sapply() which will return a vector when it can and a list otherwise. The map() and map_typ() functions in purrr are designed to be type-stable; if the output is not as expected, they will error out.
Non-standard evaluations take advantage of the existence of something else (e.g., a variable in the parent environment that has not been passed). This can cause confusion and improper results.
Pure functions have the key properties that 1) their output depends only on their inputs, and 2) they do not impact the outside world other than by way of their return value. Specifically, the function should not depend on how the user has configured their global options as shown in options(), nor should it modify those options() settings upon return of control to the parent environment.
A few examples are shown below:
# Throwing errors to stop a function (cannot actually run these!)
# stopifnot(FALSE)
# if (FALSE) { stop("Error: ", call.=FALSE) }
# if (FALSE) { stop("Error: This condition needed to be set as TRUE", call.=FALSE) }
# Behavior of [,] and [,,drop=FALSE]
mtxTest <- matrix(data=1:9, nrow=3, byrow=TRUE)
class(mtxTest)
## [1] "matrix"
mtxTest[1, ]
## [1] 1 2 3
class(mtxTest[1, ])
## [1] "integer"
mtxTest[1, , drop=FALSE]
## [,1] [,2] [,3]
## [1,] 1 2 3
class(mtxTest[1, , drop=FALSE])
## [1] "matrix"
# Behavior of sapply() - may not get what you are expecting
foo <- function(x) { x^2 }
sapply(1:5, FUN=foo)
## [1] 1 4 9 16 25
class(sapply(1:5, FUN=foo))
## [1] "numeric"
sapply(c(1, list(1.5, 2, 2.5), 3, 4, 5), FUN=foo)
## [1] 1.00 2.25 4.00 6.25 9.00 16.00 25.00
class(sapply(c(1, list(1.5, 2, 2.5), 3, 4, 5), FUN=foo))
## [1] "numeric"
sapply(list(1, c(1.5, 2, 2.5), 3, 4, 5), FUN=foo)
## [[1]]
## [1] 1
##
## [[2]]
## [1] 2.25 4.00 6.25
##
## [[3]]
## [1] 9
##
## [[4]]
## [1] 16
##
## [[5]]
## [1] 25
class(sapply(list(1, c(1.5, 2, 2.5), 3, 4, 5), FUN=foo))
## [1] "list"
This was a very enjoyable and instructive course.
This course provides an overview of loading data in to R from five main sources:
At the most basic level, the utils library easily handles reading most types of flat files:
There are also European equivalents in case the decimal needs to be set as “,” to read in the file:
The file.path() command is a nice way to put together file paths. It is more or less equivalent to paste(..., sep="/"), but with the benefit that the separator is machine/operating-system dependent, so it may be easier to use across platforms.
Further, there is the option to use colClasses() to specify the type in each column, with NULL meaning do not import. Abbreviations can be used for these as well:
# colClasses (relevant abbreviations)
R.utils::colClasses("-?cdfilnrzDP")
## [1] "NULL" "NA" "character" "double" "factor"
## [6] "integer" "logical" "numeric" "raw" "complex"
## [11] "Date" "POSIXct"
# file.path example
file.path("..", "myplot.pdf")
## [1] "../myplot.pdf"
# Key documentation for reading flat files
#
# read.table(file, header = FALSE, sep = "", quote = "\"'",
# dec = ".", numerals = c("allow.loss", "warn.loss", "no.loss"),
# row.names, col.names, as.is = !stringsAsFactors,
# na.strings = "NA", colClasses = NA, nrows = -1,
# skip = 0, check.names = TRUE, fill = !blank.lines.skip,
# strip.white = FALSE, blank.lines.skip = TRUE,
# comment.char = "#",
# allowEscapes = FALSE, flush = FALSE,
# stringsAsFactors = default.stringsAsFactors(),
# fileEncoding = "", encoding = "unknown", text, skipNul = FALSE)
#
# read.csv(file, header = TRUE, sep = ",", quote = "\"",
# dec = ".", fill = TRUE, comment.char = "", ...)
#
# read.csv2(file, header = TRUE, sep = ";", quote = "\"",
# dec = ",", fill = TRUE, comment.char = "", ...)
#
# read.delim(file, header = TRUE, sep = "\t", quote = "\"",
# dec = ".", fill = TRUE, comment.char = "", ...)
#
# read.delim2(file, header = TRUE, sep = "\t", quote = "\"",
# dec = ",", fill = TRUE, comment.char = "", ...)
There are also two libraries that can be especially helpful for reading in flat files - readr and data.table.
read_tsv is for tab-separated values
Further, the library(readxl) is handy for loading Excel sheets:
R can also load files from common statistical software such as SAS, STATA, SPSS, and MATLAB/Octave. The packages haven by Wickham and foreign by the R core team are two common examples. The R.matlab package allows for reading to/from MATLAB/Octave:
The library(haven) contains wrappers to the ReadStat package, a C library by Evan Miller, for reading files from SAS, STATA, and SPSS:
The library(foreign) can read/write all types of foreign formats, with some caveats:
Finally, the R.matlab() library is available for reading/writing MATLAB/Octave files. Per the help file:
Methods readMat() and writeMat() for reading and writing MAT files. For user with MATLAB v6 or newer installed (either locally or on a remote host), the package also provides methods for controlling MATLAB (trademark) via R and sending and retrieving data between R and MATLAB.
In brief, this package provides a one-directional interface from R to MATLAB, with communication taking place via a TCP/IP connection and with data transferred either through another connection or via the file system. On the MATLAB side, the TCP/IP connection is handled by a small Java add-on.
The methods for reading and writing MAT files are stable. The R to MATLAB interface, that is the Matlab class, is less prioritized and should be considered a beta version.
Relational databases in R (DBMS tend to use SQL for queries), including libraries:
Conventions are specified in DBI; see library(DBI):
Create the connection as “con” (or whatever) and then use that elsewhere:
When finished, dbDisconnect(con) as a courtesy so as to not tie up resources.
SQL queries from inside R - per previous, library(DBI) and then create the connection “con”:
For example, using “./SQLforDataCampRMD_v01.db”, run a few SQL commands:
# uses libraries DBI for the connection and RSQLite to interface with SQLite Browser on my machine
# Open a connection to the on-disk SQLite database file in the working directory
con <- DBI::dbConnect(RSQLite::SQLite(), "SQLforDataCampRMD_v01.db")
# List the tables, and drop dummy if it already exists
DBI::dbListTables(con)
## [1] "dummy"
# NOTE(review): DBI recommends dbExecute() for statements that return no
# result set (DROP/CREATE/INSERT); dbGetQuery() is intended for SELECTs and
# may warn on modern DBI versions when used this way.
DBI::dbGetQuery(con, "DROP TABLE IF EXISTS dummy")
# Create blank table
DBI::dbListTables(con)
## character(0)
DBI::dbGetQuery(con, "CREATE TABLE IF NOT EXISTS dummy (id PRIMARY KEY, name CHAR)")
DBI::dbGetQuery(con, "INSERT OR IGNORE INTO dummy (id, name) VALUES (1, 'Amy')")
DBI::dbGetQuery(con, "INSERT OR IGNORE INTO dummy (id, name) VALUES (2, 'Bill')")
# The duplicate primary key (id = 2) is silently skipped by OR IGNORE
DBI::dbGetQuery(con, "INSERT OR IGNORE INTO dummy (id, name) VALUES (2, 'Jen')") # Should do nothing
DBI::dbGetQuery(con, "SELECT * FROM dummy")
## id name
## 1 1 Amy
## 2 2 Bill
DBI::dbListTables(con)
## [1] "dummy"
# Can continue passing SQL commands back and forth as needed
# Close the connection
DBI::dbDisconnect(con)
## [1] TRUE
Many of the R read-in libraries already work well with web data. For example, read.csv("mywebsite.com", stringsAsFactors=FALSE) will read a CSV right off the internet. Further, there are options for:
The jsonlite library is good for working with JSON:
Prettify adds indentation to a JSON string; minify removes all indentation/whitespace:
jsonLoc <- file.path("../../..", "PythonDirectory", "UMModule04", "roster_data.json")
jsonData <- jsonlite::fromJSON(jsonLoc)
str(jsonData)
## chr [1:379, 1:3] "Calvin" "Wilson" "Emi" "Rosina" "Sylvie" ...
head(jsonData)
## [,1] [,2] [,3]
## [1,] "Calvin" "si110" "1"
## [2,] "Wilson" "si110" "0"
## [3,] "Emi" "si110" "0"
## [4,] "Rosina" "si110" "0"
## [5,] "Sylvie" "si110" "0"
## [6,] "Katarzyna" "si110" "0"
The general analysis pipeline is Collect -> Clean -> Analyze -> Report. Cleaning is needed so the raw data can work with more traditional tools (e.g., packages in Python or R). 50% - 80% of time is spent in the Collect/Clean realm, even though this is not the most exciting (and thus taught) part of data analysis. There are generally three stages of data cleaning: Explore -> Tidy -> Prepare
Exploring the Data:
Viewing the Data:
Tidy data - Wickham 2014, Principles of Tidy Data:
The principles of tidy data can be implemented using library(tidyr):
Common symptoms of messy data include:
Example code includes:
# tidyr::gather()
stocks <- data.frame(time = as.Date('2009-01-01') + 0:4,
X = rnorm(5, 0, 1), Y = rnorm(5, 0, 2), Z = rnorm(5, 0, 4)
)
stocks
## time X Y Z
## 1 2009-01-01 1.64736472 -0.1020457 -8.074672
## 2 2009-01-02 0.32981671 -0.2377234 7.617473
## 3 2009-01-03 0.05010405 -0.7091054 -9.770047
## 4 2009-01-04 0.41187479 1.1899260 -1.655071
## 5 2009-01-05 -2.20625659 -1.1299452 1.615068
# will create new columns stock (each of X, Y, Z) and price (the values that had been in X, Y, and Z),
# while not gathering the time variable; final table will be time-stock-price
stockGather <- tidyr::gather(stocks, stock, price, -time)
stockGather
## time stock price
## 1 2009-01-01 X 1.64736472
## 2 2009-01-02 X 0.32981671
## 3 2009-01-03 X 0.05010405
## 4 2009-01-04 X 0.41187479
## 5 2009-01-05 X -2.20625659
## 6 2009-01-01 Y -0.10204566
## 7 2009-01-02 Y -0.23772338
## 8 2009-01-03 Y -0.70910541
## 9 2009-01-04 Y 1.18992602
## 10 2009-01-05 Y -1.12994523
## 11 2009-01-01 Z -8.07467238
## 12 2009-01-02 Z 7.61747283
## 13 2009-01-03 Z -9.77004663
## 14 2009-01-04 Z -1.65507149
## 15 2009-01-05 Z 1.61506772
# tidyr::spread()
tidyr::spread(stockGather, stock, price)
## time X Y Z
## 1 2009-01-01 1.64736472 -0.1020457 -8.074672
## 2 2009-01-02 0.32981671 -0.2377234 7.617473
## 3 2009-01-03 0.05010405 -0.7091054 -9.770047
## 4 2009-01-04 0.41187479 1.1899260 -1.655071
## 5 2009-01-05 -2.20625659 -1.1299452 1.615068
# TRUE (this fully reverses what the gather function has done)
identical(tidyr::spread(stockGather, stock, price), stocks)
## [1] TRUE
# tidyr::separate()
df <- data.frame(x = c(NA, "a.b", "a.d", "b.c"))
df
## x
## 1 <NA>
## 2 a.b
## 3 a.d
## 4 b.c
# by default, the splits occur on anything that is not alphanumeric,
# so you get column A as whatever is before the dot and column B as whatever is after the dot
dfSep <- tidyr::separate(df, x, c("A", "B"))
dfSep
## A B
## 1 <NA> <NA>
## 2 a b
## 3 a d
## 4 b c
# tidyr::unite()
tidyr::unite(dfSep, united, c(A, B), sep="")
## united
## 1 NANA
## 2 ab
## 3 ad
## 4 bc
is.na(dfSep) # caution . . .
## A B
## 1 TRUE TRUE
## 2 FALSE FALSE
## 3 FALSE FALSE
## 4 FALSE FALSE
is.na(tidyr::unite(dfSep, united, c(A, B), sep="")) # caution . . .
## united
## 1 FALSE
## 2 FALSE
## 3 FALSE
## 4 FALSE
The tolower() and toupper() commands can be very useful also
Example code includes:
# lubridate::ymd()
lubridate::ymd("160720")
## [1] "2016-07-20 UTC"
lubridate::ymd("2016-7-20")
## [1] "2016-07-20 UTC"
lubridate::ymd("16jul20")
## [1] "2016-07-20 UTC"
lubridate::ymd("16/07/20")
## [1] "2016-07-20 UTC"
# lubridate::hms()
lubridate::hms("07h15:00")
## [1] "7H 15M 0S"
lubridate::hms("17 hours, 15 minutes 00 seconds")
## [1] "17H 15M 0S"
lubridate::hms("07-15-00")
## [1] "7H 15M 0S"
# From ?stringr::str_detect
#
# str_detect(string, pattern)
# string Input vector. Either a character vector, or something coercible to one.
# pattern Pattern to look for. The default interpretation is a regular expression, as described in stringi-search-regex. Control options with regex(). Match a fixed string (i.e. by comparing only bytes), using fixed(x). This is fast, but approximate. Generally, for matching human text, you'll want coll(x) which respects character matching rules for the specified locale. Match character, word, line and sentence boundaries with boundary(). An empty pattern, "", is equivalent to boundary("character").
#
fruit <- c("apple", "banana", "pear", "pinapple")
stringr::str_detect(fruit, "a")
## [1] TRUE TRUE TRUE TRUE
stringr::str_detect(fruit, "^a")
## [1] TRUE FALSE FALSE FALSE
stringr::str_detect(fruit, "a$")
## [1] FALSE TRUE FALSE FALSE
stringr::str_detect(fruit, "b")
## [1] FALSE TRUE FALSE FALSE
stringr::str_detect(fruit, "[aeiou]")
## [1] TRUE TRUE TRUE TRUE
# Also vectorised over pattern
stringr::str_detect("aecfg", letters)
## [1] TRUE FALSE TRUE FALSE TRUE TRUE TRUE FALSE FALSE FALSE FALSE
## [12] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [23] FALSE FALSE FALSE FALSE
# From ?stringr::str_replace
#
# str_replace(string, pattern, replacement)
# str_replace_all(string, pattern, replacement)
# string Input vector. Either a character vector, or something coercible to one.
# pattern, replacement Supply separate pattern and replacement strings to vectorise over the patterns. References of the form \1, \2 will be replaced with the contents of the respective matched group (created by ()) within the pattern. For str_replace_all only, you can perform multiple patterns and replacements to each string, by passing a named character to pattern.
#
someNA <- c(letters, "", LETTERS, "")
someNA[someNA==""] <- NA
someNA
## [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j" "k" "l" "m" "n" "o" "p" "q"
## [18] "r" "s" "t" "u" "v" "w" "x" "y" "z" NA "A" "B" "C" "D" "E" "F" "G"
## [35] "H" "I" "J" "K" "L" "M" "N" "O" "P" "Q" "R" "S" "T" "U" "V" "W" "X"
## [52] "Y" "Z" NA
fruits <- c("one apple", "two pears", "three bananas")
stringr::str_replace(fruits, "[aeiou]", "-") # Replace FIRST instance
## [1] "-ne apple" "tw- pears" "thr-e bananas"
stringr::str_replace_all(fruits, "[aeiou]", "-") # Replace ALL instances
## [1] "-n- -ppl-" "tw- p--rs" "thr-- b-n-n-s"
stringr::str_replace(fruits, "([aeiou])", "\\1\\1\\1") # Triple up on the first vowel
## [1] "ooone apple" "twooo pears" "threeee bananas"
stringr::str_replace(fruits, "[aeiou]", c("1", "2", "3")) # First vowel to 1, 2, 3 in word 1, 2, 3
## [1] "1ne apple" "tw2 pears" "thr3e bananas"
stringr::str_replace(fruits, c("a", "e", "i"), "-") # First a -> - in word 1, first e -> - in word 2 . . .
## [1] "one -pple" "two p-ars" "three bananas"
stringr::str_replace_all(fruits, "([aeiou])", "\\1\\1") # Double up on all vowels
## [1] "oonee aapplee" "twoo peeaars" "threeee baanaanaas"
stringr::str_replace_all(fruits, "[aeiou]", c("1", "2", "3")) # All vowels to 1, 2, 3, in word 1, 2, 3
## [1] "1n1 1ppl1" "tw2 p22rs" "thr33 b3n3n3s"
stringr::str_replace_all(fruits, c("a", "e", "i"), "-") # All a -> - in word 1, . . .
## [1] "one -pple" "two p-ars" "three bananas"
Further, the outline from the weather gathering data cleaning challenge is noted:
The library(dplyr) is a grammar of data manipulation. It is written in C++ so you get the speed of C with the convenience of R. It is in essence the data frame to data frame portion of plyr (plyr was the original Split-Apply-Combine). May want to look in to count, transmute, and other verbs added post this summary.
The examples use data(hflights) from library(hflights):
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:purrr':
##
## order_by
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(hflights)
data(hflights)
head(hflights)
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## 5424 2011 1 1 6 1400 1500 AA
## 5425 2011 1 2 7 1401 1501 AA
## 5426 2011 1 3 1 1352 1502 AA
## 5427 2011 1 4 2 1403 1513 AA
## 5428 2011 1 5 3 1405 1507 AA
## 5429 2011 1 6 4 1359 1503 AA
## FlightNum TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin
## 5424 428 N576AA 60 40 -10 0 IAH
## 5425 428 N557AA 60 45 -9 1 IAH
## 5426 428 N541AA 70 48 -8 -8 IAH
## 5427 428 N403AA 70 39 3 3 IAH
## 5428 428 N492AA 62 44 -3 5 IAH
## 5429 428 N262AA 64 45 -7 -1 IAH
## Dest Distance TaxiIn TaxiOut Cancelled CancellationCode Diverted
## 5424 DFW 224 7 13 0 0
## 5425 DFW 224 6 9 0 0
## 5426 DFW 224 5 17 0 0
## 5427 DFW 224 9 22 0 0
## 5428 DFW 224 9 9 0 0
## 5429 DFW 224 6 13 0 0
summary(hflights)
## Year Month DayofMonth DayOfWeek
## Min. :2011 Min. : 1.000 Min. : 1.00 Min. :1.000
## 1st Qu.:2011 1st Qu.: 4.000 1st Qu.: 8.00 1st Qu.:2.000
## Median :2011 Median : 7.000 Median :16.00 Median :4.000
## Mean :2011 Mean : 6.514 Mean :15.74 Mean :3.948
## 3rd Qu.:2011 3rd Qu.: 9.000 3rd Qu.:23.00 3rd Qu.:6.000
## Max. :2011 Max. :12.000 Max. :31.00 Max. :7.000
##
## DepTime ArrTime UniqueCarrier FlightNum
## Min. : 1 Min. : 1 Length:227496 Min. : 1
## 1st Qu.:1021 1st Qu.:1215 Class :character 1st Qu.: 855
## Median :1416 Median :1617 Mode :character Median :1696
## Mean :1396 Mean :1578 Mean :1962
## 3rd Qu.:1801 3rd Qu.:1953 3rd Qu.:2755
## Max. :2400 Max. :2400 Max. :7290
## NA's :2905 NA's :3066
## TailNum ActualElapsedTime AirTime ArrDelay
## Length:227496 Min. : 34.0 Min. : 11.0 Min. :-70.000
## Class :character 1st Qu.: 77.0 1st Qu.: 58.0 1st Qu.: -8.000
## Mode :character Median :128.0 Median :107.0 Median : 0.000
## Mean :129.3 Mean :108.1 Mean : 7.094
## 3rd Qu.:165.0 3rd Qu.:141.0 3rd Qu.: 11.000
## Max. :575.0 Max. :549.0 Max. :978.000
## NA's :3622 NA's :3622 NA's :3622
## DepDelay Origin Dest Distance
## Min. :-33.000 Length:227496 Length:227496 Min. : 79.0
## 1st Qu.: -3.000 Class :character Class :character 1st Qu.: 376.0
## Median : 0.000 Mode :character Mode :character Median : 809.0
## Mean : 9.445 Mean : 787.8
## 3rd Qu.: 9.000 3rd Qu.:1042.0
## Max. :981.000 Max. :3904.0
## NA's :2905
## TaxiIn TaxiOut Cancelled CancellationCode
## Min. : 1.000 Min. : 1.00 Min. :0.00000 Length:227496
## 1st Qu.: 4.000 1st Qu.: 10.00 1st Qu.:0.00000 Class :character
## Median : 5.000 Median : 14.00 Median :0.00000 Mode :character
## Mean : 6.099 Mean : 15.09 Mean :0.01307
## 3rd Qu.: 7.000 3rd Qu.: 18.00 3rd Qu.:0.00000
## Max. :165.000 Max. :163.00 Max. :1.00000
## NA's :3066 NA's :2947
## Diverted
## Min. :0.000000
## 1st Qu.:0.000000
## Median :0.000000
## Mean :0.002853
## 3rd Qu.:0.000000
## Max. :1.000000
##
The “tbl” is a special type of data frame, which is very helpful for printing:
An interesting way to do a lookup table:
See for example:
# Lookup table: a named character vector mapping two-letter carrier codes to
# full airline names; subsetting by name (lut[codes]) performs a vectorised
# translation of every code at once.
lut <- c("AA" = "American", "AS" = "Alaska", "B6" = "JetBlue", "CO" = "Continental",
"DL" = "Delta", "OO" = "SkyWest", "UA" = "United", "US" = "US_Airways",
"WN" = "Southwest", "EV" = "Atlantic_Southeast", "F9" = "Frontier",
"FL" = "AirTran", "MQ" = "American_Eagle", "XE" = "ExpressJet", "YV" = "Mesa"
)
# One translated name per row of hflights, added as a new column
hflights$Carrier <- lut[hflights$UniqueCarrier]
glimpse(hflights)
## Observations: 227,496
## Variables: 22
## $ Year (int) 2011, 2011, 2011, 2011, 2011, 2011, 2011, 20...
## $ Month (int) 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ DayofMonth (int) 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1...
## $ DayOfWeek (int) 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6,...
## $ DepTime (int) 1400, 1401, 1352, 1403, 1405, 1359, 1359, 13...
## $ ArrTime (int) 1500, 1501, 1502, 1513, 1507, 1503, 1509, 14...
## $ UniqueCarrier (chr) "AA", "AA", "AA", "AA", "AA", "AA", "AA", "A...
## $ FlightNum (int) 428, 428, 428, 428, 428, 428, 428, 428, 428,...
## $ TailNum (chr) "N576AA", "N557AA", "N541AA", "N403AA", "N49...
## $ ActualElapsedTime (int) 60, 60, 70, 70, 62, 64, 70, 59, 71, 70, 70, ...
## $ AirTime (int) 40, 45, 48, 39, 44, 45, 43, 40, 41, 45, 42, ...
## $ ArrDelay (int) -10, -9, -8, 3, -3, -7, -1, -16, 44, 43, 29,...
## $ DepDelay (int) 0, 1, -8, 3, 5, -1, -1, -5, 43, 43, 29, 19, ...
## $ Origin (chr) "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "I...
## $ Dest (chr) "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "D...
## $ Distance (int) 224, 224, 224, 224, 224, 224, 224, 224, 224,...
## $ TaxiIn (int) 7, 6, 5, 9, 9, 6, 12, 7, 8, 6, 8, 4, 6, 5, 6...
## $ TaxiOut (int) 13, 9, 17, 22, 9, 13, 15, 12, 22, 19, 20, 11...
## $ Cancelled (int) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ CancellationCode (chr) "", "", "", "", "", "", "", "", "", "", "", ...
## $ Diverted (int) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Carrier (chr) "American", "American", "American", "America...
There are five main verbs in dplyr:
There is also the group_by capability for summaries of sub-groups:
The dplyr library can also work with databases. It only loads the data that you need, and you do not need to know the relevant SQL code – dplyr writes the SQL code for you.
Basic select and mutate examples include:
data(hflights)
# Make it faster, as well as a prettier printer
hflights <- tbl_df(hflights)
hflights
## Source: local data frame [227,496 x 21]
##
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## (int) (int) (int) (int) (int) (int) (chr)
## 1 2011 1 1 6 1400 1500 AA
## 2 2011 1 2 7 1401 1501 AA
## 3 2011 1 3 1 1352 1502 AA
## 4 2011 1 4 2 1403 1513 AA
## 5 2011 1 5 3 1405 1507 AA
## 6 2011 1 6 4 1359 1503 AA
## 7 2011 1 7 5 1359 1509 AA
## 8 2011 1 8 6 1355 1454 AA
## 9 2011 1 9 7 1443 1554 AA
## 10 2011 1 10 1 1443 1553 AA
## .. ... ... ... ... ... ... ...
## Variables not shown: FlightNum (int), TailNum (chr), ActualElapsedTime
## (int), AirTime (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest
## (chr), Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
## CancellationCode (chr), Diverted (int)
class(hflights)
## [1] "tbl_df" "tbl" "data.frame"
# Select examples: pick columns by name, by contiguous range (:), or with
# helper functions such as starts_with() / ends_with().
select(hflights, ActualElapsedTime, AirTime, ArrDelay, DepDelay)
## Source: local data frame [227,496 x 4]
##
## ActualElapsedTime AirTime ArrDelay DepDelay
## (int) (int) (int) (int)
## 1 60 40 -10 0
## 2 60 45 -9 1
## 3 70 48 -8 -8
## 4 70 39 3 3
## 5 62 44 -3 5
## 6 64 45 -7 -1
## 7 70 43 -1 -1
## 8 59 40 -16 -5
## 9 71 41 44 43
## 10 70 45 43 43
## .. ... ... ... ...
# A range of contiguous columns, selected with the : operator
select(hflights, Origin:Cancelled)
## Source: local data frame [227,496 x 6]
##
## Origin Dest Distance TaxiIn TaxiOut Cancelled
## (chr) (chr) (int) (int) (int) (int)
## 1 IAH DFW 224 7 13 0
## 2 IAH DFW 224 6 9 0
## 3 IAH DFW 224 5 17 0
## 4 IAH DFW 224 9 22 0
## 5 IAH DFW 224 9 9 0
## 6 IAH DFW 224 6 13 0
## 7 IAH DFW 224 12 15 0
## 8 IAH DFW 224 7 12 0
## 9 IAH DFW 224 8 22 0
## 10 IAH DFW 224 6 19 0
## .. ... ... ... ... ... ...
# Multiple ranges can be combined in a single select()
select(hflights, Year:DayOfWeek, ArrDelay:Diverted)
## Source: local data frame [227,496 x 14]
##
## Year Month DayofMonth DayOfWeek ArrDelay DepDelay Origin Dest
## (int) (int) (int) (int) (int) (int) (chr) (chr)
## 1 2011 1 1 6 -10 0 IAH DFW
## 2 2011 1 2 7 -9 1 IAH DFW
## 3 2011 1 3 1 -8 -8 IAH DFW
## 4 2011 1 4 2 3 3 IAH DFW
## 5 2011 1 5 3 -3 5 IAH DFW
## 6 2011 1 6 4 -7 -1 IAH DFW
## 7 2011 1 7 5 -1 -1 IAH DFW
## 8 2011 1 8 6 -16 -5 IAH DFW
## 9 2011 1 9 7 44 43 IAH DFW
## 10 2011 1 10 1 43 43 IAH DFW
## .. ... ... ... ... ... ... ... ...
## Variables not shown: Distance (int), TaxiIn (int), TaxiOut (int),
## Cancelled (int), CancellationCode (chr), Diverted (int)
# Helper functions match columns by name pattern
select(hflights, ends_with("Delay"))
## Source: local data frame [227,496 x 2]
##
## ArrDelay DepDelay
## (int) (int)
## 1 -10 0
## 2 -9 1
## 3 -8 -8
## 4 3 3
## 5 -3 5
## 6 -7 -1
## 7 -1 -1
## 8 -16 -5
## 9 44 43
## 10 43 43
## .. ... ...
# Explicit names and helpers can be mixed freely
select(hflights, UniqueCarrier, ends_with("Num"), starts_with("Cancel"))
## Source: local data frame [227,496 x 5]
##
## UniqueCarrier FlightNum TailNum Cancelled CancellationCode
## (chr) (int) (chr) (int) (chr)
## 1 AA 428 N576AA 0
## 2 AA 428 N557AA 0
## 3 AA 428 N541AA 0
## 4 AA 428 N403AA 0
## 5 AA 428 N492AA 0
## 6 AA 428 N262AA 0
## 7 AA 428 N493AA 0
## 8 AA 428 N477AA 0
## 9 AA 428 N476AA 0
## 10 AA 428 N504AA 0
## .. ... ... ... ... ...
select(hflights, ends_with("Time"), ends_with("Delay"))
## Source: local data frame [227,496 x 6]
##
## DepTime ArrTime ActualElapsedTime AirTime ArrDelay DepDelay
## (int) (int) (int) (int) (int) (int)
## 1 1400 1500 60 40 -10 0
## 2 1401 1501 60 45 -9 1
## 3 1352 1502 70 48 -8 -8
## 4 1403 1513 70 39 3 3
## 5 1405 1507 62 44 -3 5
## 6 1359 1503 64 45 -7 -1
## 7 1359 1509 70 43 -1 -1
## 8 1355 1454 59 40 -16 -5
## 9 1443 1554 71 41 44 43
## 10 1443 1553 70 45 43 43
## .. ... ... ... ... ... ...
# Mutate example: add derived columns; later expressions (loss_ratio) may
# reference columns created earlier in the same mutate() call (loss).
m1 <- mutate(hflights, loss = ArrDelay - DepDelay, loss_ratio = loss / DepDelay)
class(m1)
## [1] "tbl_df" "tbl" "data.frame"
m1
## Source: local data frame [227,496 x 23]
##
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## (int) (int) (int) (int) (int) (int) (chr)
## 1 2011 1 1 6 1400 1500 AA
## 2 2011 1 2 7 1401 1501 AA
## 3 2011 1 3 1 1352 1502 AA
## 4 2011 1 4 2 1403 1513 AA
## 5 2011 1 5 3 1405 1507 AA
## 6 2011 1 6 4 1359 1503 AA
## 7 2011 1 7 5 1359 1509 AA
## 8 2011 1 8 6 1355 1454 AA
## 9 2011 1 9 7 1443 1554 AA
## 10 2011 1 10 1 1443 1553 AA
## .. ... ... ... ... ... ... ...
## Variables not shown: FlightNum (int), TailNum (chr), ActualElapsedTime
## (int), AirTime (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest
## (chr), Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
## CancellationCode (chr), Diverted (int), loss (int), loss_ratio (dbl)
# glimpse() is a transposed preview: one row per variable, as many values as fit.
# Note in the output below that loss_ratio is -Inf where DepDelay == 0
# (division by zero is not guarded here).
glimpse(m1)
## Observations: 227,496
## Variables: 23
## $ Year (int) 2011, 2011, 2011, 2011, 2011, 2011, 2011, 20...
## $ Month (int) 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ DayofMonth (int) 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1...
## $ DayOfWeek (int) 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6,...
## $ DepTime (int) 1400, 1401, 1352, 1403, 1405, 1359, 1359, 13...
## $ ArrTime (int) 1500, 1501, 1502, 1513, 1507, 1503, 1509, 14...
## $ UniqueCarrier (chr) "AA", "AA", "AA", "AA", "AA", "AA", "AA", "A...
## $ FlightNum (int) 428, 428, 428, 428, 428, 428, 428, 428, 428,...
## $ TailNum (chr) "N576AA", "N557AA", "N541AA", "N403AA", "N49...
## $ ActualElapsedTime (int) 60, 60, 70, 70, 62, 64, 70, 59, 71, 70, 70, ...
## $ AirTime (int) 40, 45, 48, 39, 44, 45, 43, 40, 41, 45, 42, ...
## $ ArrDelay (int) -10, -9, -8, 3, -3, -7, -1, -16, 44, 43, 29,...
## $ DepDelay (int) 0, 1, -8, 3, 5, -1, -1, -5, 43, 43, 29, 19, ...
## $ Origin (chr) "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "I...
## $ Dest (chr) "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "D...
## $ Distance (int) 224, 224, 224, 224, 224, 224, 224, 224, 224,...
## $ TaxiIn (int) 7, 6, 5, 9, 9, 6, 12, 7, 8, 6, 8, 4, 6, 5, 6...
## $ TaxiOut (int) 13, 9, 17, 22, 9, 13, 15, 12, 22, 19, 20, 11...
## $ Cancelled (int) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ CancellationCode (chr) "", "", "", "", "", "", "", "", "", "", "", ...
## $ Diverted (int) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ loss (int) -10, -10, 0, 0, -8, -6, 0, -11, 1, 0, 0, -14...
## $ loss_ratio (dbl) -Inf, -10.00000000, 0.00000000, 0.00000000, ...
Additionally, examples for filter and arrange:
# Examples for filter: keep rows matching logical conditions. Multiple
# comma-separated conditions are combined with AND; use | explicitly for OR.
filter(hflights, Distance >= 3000) # All flights that traveled 3000 miles or more
## Source: local data frame [527 x 21]
##
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## (int) (int) (int) (int) (int) (int) (chr)
## 1 2011 1 31 1 924 1413 CO
## 2 2011 1 30 7 925 1410 CO
## 3 2011 1 29 6 1045 1445 CO
## 4 2011 1 28 5 1516 1916 CO
## 5 2011 1 27 4 950 1344 CO
## 6 2011 1 26 3 944 1350 CO
## 7 2011 1 25 2 924 1337 CO
## 8 2011 1 24 1 1144 1605 CO
## 9 2011 1 23 7 926 1335 CO
## 10 2011 1 22 6 942 1340 CO
## .. ... ... ... ... ... ... ...
## Variables not shown: FlightNum (int), TailNum (chr), ActualElapsedTime
## (int), AirTime (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest
## (chr), Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
## CancellationCode (chr), Diverted (int)
# %in% tests membership in a set (and never returns NA, unlike ==)
filter(hflights, UniqueCarrier %in% c("B6", "WN", "DL"))
## Source: local data frame [48,679 x 21]
##
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## (int) (int) (int) (int) (int) (int) (chr)
## 1 2011 1 1 6 654 1124 B6
## 2 2011 1 1 6 1639 2110 B6
## 3 2011 1 2 7 703 1113 B6
## 4 2011 1 2 7 1604 2040 B6
## 5 2011 1 3 1 659 1100 B6
## 6 2011 1 3 1 1801 2200 B6
## 7 2011 1 4 2 654 1103 B6
## 8 2011 1 4 2 1608 2034 B6
## 9 2011 1 5 3 700 1103 B6
## 10 2011 1 5 3 1544 1954 B6
## .. ... ... ... ... ... ... ...
## Variables not shown: FlightNum (int), TailNum (chr), ActualElapsedTime
## (int), AirTime (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest
## (chr), Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
## CancellationCode (chr), Diverted (int)
# Conditions may compare arbitrary expressions built from columns
filter(hflights, (TaxiIn + TaxiOut) > AirTime) # Flights where taxiing took longer than flying
## Source: local data frame [1,389 x 21]
##
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## (int) (int) (int) (int) (int) (int) (chr)
## 1 2011 1 24 1 731 904 AA
## 2 2011 1 30 7 1959 2132 AA
## 3 2011 1 24 1 1621 1749 AA
## 4 2011 1 10 1 941 1113 AA
## 5 2011 1 31 1 1301 1356 CO
## 6 2011 1 31 1 2113 2215 CO
## 7 2011 1 31 1 1434 1539 CO
## 8 2011 1 31 1 900 1006 CO
## 9 2011 1 30 7 1304 1408 CO
## 10 2011 1 30 7 2004 2128 CO
## .. ... ... ... ... ... ... ...
## Variables not shown: FlightNum (int), TailNum (chr), ActualElapsedTime
## (int), AirTime (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest
## (chr), Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
## CancellationCode (chr), Diverted (int)
filter(hflights, DepTime < 500 | ArrTime > 2200) # Flights departed before 5am or arrived after 10pm
## Source: local data frame [27,799 x 21]
##
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## (int) (int) (int) (int) (int) (int) (chr)
## 1 2011 1 4 2 2100 2207 AA
## 2 2011 1 14 5 2119 2229 AA
## 3 2011 1 10 1 1934 2235 AA
## 4 2011 1 26 3 1905 2211 AA
## 5 2011 1 30 7 1856 2209 AA
## 6 2011 1 9 7 1938 2228 AS
## 7 2011 1 31 1 1919 2231 CO
## 8 2011 1 31 1 2116 2344 CO
## 9 2011 1 31 1 1850 2211 CO
## 10 2011 1 31 1 2102 2216 CO
## .. ... ... ... ... ... ... ...
## Variables not shown: FlightNum (int), TailNum (chr), ActualElapsedTime
## (int), AirTime (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest
## (chr), Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
## CancellationCode (chr), Diverted (int)
# Comma-separated conditions are ANDed together
filter(hflights, DepDelay > 0, ArrDelay < 0) # Flights that departed late but arrived ahead of schedule
## Source: local data frame [27,712 x 21]
##
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## (int) (int) (int) (int) (int) (int) (chr)
## 1 2011 1 2 7 1401 1501 AA
## 2 2011 1 5 3 1405 1507 AA
## 3 2011 1 18 2 1408 1508 AA
## 4 2011 1 18 2 721 827 AA
## 5 2011 1 12 3 2015 2113 AA
## 6 2011 1 13 4 2020 2116 AA
## 7 2011 1 26 3 2009 2103 AA
## 8 2011 1 1 6 1631 1736 AA
## 9 2011 1 10 1 1639 1740 AA
## 10 2011 1 12 3 1631 1739 AA
## .. ... ... ... ... ... ... ...
## Variables not shown: FlightNum (int), TailNum (chr), ActualElapsedTime
## (int), AirTime (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest
## (chr), Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
## CancellationCode (chr), Diverted (int)
filter(hflights, Cancelled == 1, DepDelay > 0) # Flights that were cancelled after being delayed
## Source: local data frame [40 x 21]
##
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## (int) (int) (int) (int) (int) (int) (chr)
## 1 2011 1 26 3 1926 NA CO
## 2 2011 1 11 2 1100 NA US
## 3 2011 1 19 3 1811 NA XE
## 4 2011 1 7 5 2028 NA XE
## 5 2011 2 4 5 1638 NA AA
## 6 2011 2 8 2 1057 NA CO
## 7 2011 2 2 3 802 NA XE
## 8 2011 2 9 3 904 NA XE
## 9 2011 2 1 2 1508 NA OO
## 10 2011 3 31 4 1016 NA CO
## .. ... ... ... ... ... ... ...
## Variables not shown: FlightNum (int), TailNum (chr), ActualElapsedTime
## (int), AirTime (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest
## (chr), Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
## CancellationCode (chr), Diverted (int)
# A small multi-step pipeline written with intermediate variables:
# filter -> mutate -> select (the piped equivalent appears later in the notes).
c1 <- filter(hflights, Dest == "JFK") # Flights that had JFK as their destination: c1
# paste() with sep="-" builds a "Year-Month-Day" string; months/days are not
# zero-padded (see "2011-1-1" below), so these sort lexically, not by date.
c2 <- mutate(c1, Date = paste(Year, Month, DayofMonth, sep="-")) # Create a Date column: c2
select(c2, Date, DepTime, ArrTime, TailNum) # Print out a selection of columns of c2
## Source: local data frame [695 x 4]
##
## Date DepTime ArrTime TailNum
## (chr) (int) (int) (chr)
## 1 2011-1-1 654 1124 N324JB
## 2 2011-1-1 1639 2110 N324JB
## 3 2011-1-2 703 1113 N324JB
## 4 2011-1-2 1604 2040 N324JB
## 5 2011-1-3 659 1100 N229JB
## 6 2011-1-3 1801 2200 N206JB
## 7 2011-1-4 654 1103 N267JB
## 8 2011-1-4 1608 2034 N267JB
## 9 2011-1-5 700 1103 N708JB
## 10 2011-1-5 1544 1954 N644JB
## .. ... ... ... ...
# Cancelled flights that still have a recorded departure delay
dtc <- filter(hflights, Cancelled == 1, !is.na(DepDelay)) # Definition of dtc
# Examples for arrange: reorder rows by one or more columns (ascending by
# default; wrap a column in desc() for descending order).
arrange(dtc, DepDelay) # Arrange dtc by departure delays
## Source: local data frame [68 x 21]
##
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## (int) (int) (int) (int) (int) (int) (chr)
## 1 2011 7 23 6 605 NA F9
## 2 2011 1 17 1 916 NA XE
## 3 2011 12 1 4 541 NA US
## 4 2011 10 12 3 2022 NA MQ
## 5 2011 7 29 5 1424 NA CO
## 6 2011 9 29 4 1639 NA OO
## 7 2011 2 9 3 555 NA MQ
## 8 2011 5 9 1 715 NA OO
## 9 2011 1 20 4 1413 NA UA
## 10 2011 1 17 1 831 NA WN
## .. ... ... ... ... ... ... ...
## Variables not shown: FlightNum (int), TailNum (chr), ActualElapsedTime
## (int), AirTime (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest
## (chr), Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
## CancellationCode (chr), Diverted (int)
arrange(dtc, CancellationCode) # Arrange dtc so that cancellation reasons are grouped
## Source: local data frame [68 x 21]
##
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## (int) (int) (int) (int) (int) (int) (chr)
## 1 2011 1 20 4 1413 NA UA
## 2 2011 1 7 5 2028 NA XE
## 3 2011 2 4 5 1638 NA AA
## 4 2011 2 8 2 1057 NA CO
## 5 2011 2 1 2 1508 NA OO
## 6 2011 2 21 1 2257 NA OO
## 7 2011 2 9 3 555 NA MQ
## 8 2011 3 18 5 727 NA UA
## 9 2011 4 4 1 1632 NA DL
## 10 2011 4 8 5 1608 NA WN
## .. ... ... ... ... ... ... ...
## Variables not shown: FlightNum (int), TailNum (chr), ActualElapsedTime
## (int), AirTime (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest
## (chr), Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
## CancellationCode (chr), Diverted (int)
# Later sort keys break ties in earlier ones
arrange(dtc, UniqueCarrier, DepDelay) # Arrange dtc according to carrier and departure delays
## Source: local data frame [68 x 21]
##
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## (int) (int) (int) (int) (int) (int) (chr)
## 1 2011 8 18 4 1808 NA AA
## 2 2011 2 4 5 1638 NA AA
## 3 2011 7 29 5 1424 NA CO
## 4 2011 1 26 3 1703 NA CO
## 5 2011 8 11 4 1320 NA CO
## 6 2011 7 25 1 1654 NA CO
## 7 2011 1 26 3 1926 NA CO
## 8 2011 3 31 4 1016 NA CO
## 9 2011 2 8 2 1057 NA CO
## 10 2011 4 4 1 1632 NA DL
## .. ... ... ... ... ... ... ...
## Variables not shown: FlightNum (int), TailNum (chr), ActualElapsedTime
## (int), AirTime (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest
## (chr), Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
## CancellationCode (chr), Diverted (int)
arrange(hflights, UniqueCarrier, desc(DepDelay)) # Arrange by carrier and decreasing departure delays
## Source: local data frame [227,496 x 21]
##
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## (int) (int) (int) (int) (int) (int) (chr)
## 1 2011 12 12 1 650 808 AA
## 2 2011 11 19 6 1752 1910 AA
## 3 2011 12 22 4 1728 1848 AA
## 4 2011 10 23 7 2305 2 AA
## 5 2011 9 27 2 1206 1300 AA
## 6 2011 3 17 4 1647 1747 AA
## 7 2011 6 21 2 955 1315 AA
## 8 2011 5 20 5 2359 130 AA
## 9 2011 4 19 2 2023 2142 AA
## 10 2011 5 12 4 2133 53 AA
## .. ... ... ... ... ... ... ...
## Variables not shown: FlightNum (int), TailNum (chr), ActualElapsedTime
## (int), AirTime (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest
## (chr), Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
## CancellationCode (chr), Diverted (int)
# The sort key can be a computed expression
arrange(hflights, DepDelay + ArrDelay) # Arrange flights by total delay (normal order)
## Source: local data frame [227,496 x 21]
##
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## (int) (int) (int) (int) (int) (int) (chr)
## 1 2011 7 3 7 1914 2039 XE
## 2 2011 8 31 3 934 1039 OO
## 3 2011 8 21 7 935 1039 OO
## 4 2011 8 28 7 2059 2206 OO
## 5 2011 8 29 1 935 1041 OO
## 6 2011 12 25 7 741 926 OO
## 7 2011 1 30 7 620 812 OO
## 8 2011 8 3 3 1741 1810 XE
## 9 2011 8 4 4 930 1041 OO
## 10 2011 8 18 4 939 1043 OO
## .. ... ... ... ... ... ... ...
## Variables not shown: FlightNum (int), TailNum (chr), ActualElapsedTime
## (int), AirTime (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest
## (chr), Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
## CancellationCode (chr), Diverted (int)
Additionally, examples for the summarize verb:
# summarize() collapses a data frame to a single row of aggregate values.
# Print out a summary with variables min_dist and max_dist
summarize(hflights, min_dist = min(Distance), max_dist = max(Distance))
## Source: local data frame [1 x 2]
##
## min_dist max_dist
## (int) (int)
## 1 79 3904
# Print out a summary with variable max_div
summarize(filter(hflights, Diverted == 1), max_div = max(Distance))
## Source: local data frame [1 x 1]
##
## max_div
## (int)
## 1 3904
# Remove rows that have NA ArrDelay: temp1
# (filtering NAs first avoids needing na.rm = TRUE in every aggregate below)
temp1 <- filter(hflights, !is.na(ArrDelay))
# Generate summary about ArrDelay column of temp1
summarize(temp1, earliest=min(ArrDelay), average=mean(ArrDelay), latest=max(ArrDelay), sd=sd(ArrDelay))
## Source: local data frame [1 x 4]
##
## earliest average latest sd
## (int) (dbl) (int) (dbl)
## 1 -70 7.094334 978 30.70852
# Keep rows that have no NA TaxiIn and no NA TaxiOut: temp2
temp2 <- filter(hflights, !is.na(TaxiIn), !is.na(TaxiOut))
# Print the maximum taxiing difference of temp2 with summarize()
summarize(temp2, max_taxi_diff = max(abs(TaxiIn - TaxiOut)))
## Source: local data frame [1 x 1]
##
## max_taxi_diff
## (int)
## 1 160
# Generate summarizing statistics for hflights
# (n() counts rows; n_distinct() counts unique values)
summarize(hflights, n_obs = n(), n_carrier = n_distinct(UniqueCarrier), n_dest = n_distinct(Dest))
## Source: local data frame [1 x 3]
##
## n_obs n_carrier n_dest
## (int) (int) (int)
## 1 227496 15 116
# All American Airline flights
aa <- filter(hflights, UniqueCarrier == "AA")
# Generate summarizing statistics for aa
summarize(aa, n_flights = n(), n_canc = sum(Cancelled), avg_delay = mean(ArrDelay, na.rm=TRUE))
## Source: local data frame [1 x 3]
##
## n_flights n_canc avg_delay
## (int) (int) (dbl)
## 1 3244 60 0.8917558
Additionally, examples for the pipe/chain as per magrittr:
# magrittr pipe examples: %>% feeds the left-hand result in as the first
# argument of the next verb, so multi-step transformations read top-to-bottom.
# Find the average delta in taxi times
hflights %>%
mutate(diff = (TaxiOut - TaxiIn)) %>%
filter(!is.na(diff)) %>%
summarize(avg = mean(diff))
## Source: local data frame [1 x 1]
##
## avg
## (dbl)
## 1 8.992064
# Find flights that average less than 70 mph assuming 100 wasted minutes per flight
# (60 * miles / minutes converts to miles per hour)
hflights %>%
mutate(RealTime = ActualElapsedTime + 100, mph = 60 * Distance / RealTime) %>%
filter(!is.na(mph), mph < 70) %>%
summarize(n_less = n(), n_dest = n_distinct(Dest), min_dist = min(Distance), max_dist = max(Distance))
## Source: local data frame [1 x 4]
##
## n_less n_dest min_dist max_dist
## (int) (int) (int) (int)
## 1 6726 13 79 305
# Find flights that average less than 105 mph, or that are diverted/cancelled
hflights %>%
mutate(RealTime = ActualElapsedTime + 100, mph = Distance / RealTime * 60) %>%
filter(mph < 105 | Cancelled == 1 | Diverted == 1) %>%
summarize(n_non = n(), n_dest = n_distinct(Dest), min_dist = min(Distance), max_dist = max(Distance))
## Source: local data frame [1 x 4]
##
## n_non n_dest min_dist max_dist
## (int) (int) (int) (int)
## 1 42400 113 79 3904
# Find overnight flights (arrival clock time earlier than departure time)
filter(hflights, !is.na(DepTime), !is.na(ArrTime), DepTime > ArrTime) %>%
summarize(num = n())
## Source: local data frame [1 x 1]
##
## num
## (int)
## 1 2718
There is also the group_by capability, typically for use with summarize:
# group_by() makes subsequent summarize() calls compute one row per group.
# Make an ordered per-carrier summary of hflights
group_by(hflights, UniqueCarrier) %>%
summarize(p_canc = 100 * mean(Cancelled, na.rm=TRUE), avg_delay = mean(ArrDelay, na.rm=TRUE)) %>%
arrange(avg_delay, p_canc)
## Source: local data frame [15 x 3]
##
## UniqueCarrier p_canc avg_delay
## (chr) (dbl) (dbl)
## 1 US 1.1268986 -0.6307692
## 2 AA 1.8495684 0.8917558
## 3 FL 0.9817672 1.8536239
## 4 AS 0.0000000 3.1923077
## 5 YV 1.2658228 4.0128205
## 6 DL 1.5903067 6.0841374
## 7 CO 0.6782614 6.0986983
## 8 MQ 2.9044750 7.1529751
## 9 EV 3.4482759 7.2569543
## 10 WN 1.5504047 7.5871430
## 11 F9 0.7159905 7.6682692
## 12 XE 1.5495599 8.1865242
## 13 OO 1.3946828 8.6934922
## 14 B6 2.5899281 9.8588410
## 15 UA 1.6409266 10.4628628
# Ordered overview of average arrival delays per carrier
# (rank() assigns 1 to the smallest average delay)
hflights %>%
filter(!is.na(ArrDelay), ArrDelay > 0) %>%
group_by(UniqueCarrier) %>%
summarize(avg = mean(ArrDelay)) %>%
mutate(rank = rank(avg)) %>%
arrange(rank)
## Source: local data frame [15 x 3]
##
## UniqueCarrier avg rank
## (chr) (dbl) (dbl)
## 1 YV 18.67568 1
## 2 F9 18.68683 2
## 3 US 20.70235 3
## 4 CO 22.13374 4
## 5 AS 22.91195 5
## 6 OO 24.14663 6
## 7 XE 24.19337 7
## 8 WN 25.27750 8
## 9 FL 27.85693 9
## 10 AA 28.49740 10
## 11 DL 32.12463 11
## 12 UA 32.48067 12
## 13 MQ 38.75135 13
## 14 EV 40.24231 14
## 15 B6 45.47744 15
# How many airplanes only flew to one destination?
# (a second summarise collapses the per-tail-number summary to one row)
hflights %>%
group_by(TailNum) %>%
summarise(destPerTail = n_distinct(Dest)) %>%
filter(destPerTail == 1) %>%
summarise(nplanes=n())
## Source: local data frame [1 x 1]
##
## nplanes
## (int)
## 1 1526
# Find the most visited destination for each carrier
# (after summarise, the result is still grouped by UniqueCarrier, so
# mutate/filter operate within each carrier's group)
hflights %>%
group_by(UniqueCarrier, Dest) %>%
summarise(n = n()) %>%
mutate(rank = rank(-n)) %>%
filter(rank == 1)
## Source: local data frame [15 x 4]
## Groups: UniqueCarrier [15]
##
## UniqueCarrier Dest n rank
## (chr) (chr) (int) (dbl)
## 1 AA DFW 2105 1
## 2 AS SEA 365 1
## 3 B6 JFK 695 1
## 4 CO EWR 3924 1
## 5 DL ATL 2396 1
## 6 EV DTW 851 1
## 7 F9 DEN 837 1
## 8 FL ATL 2029 1
## 9 MQ DFW 2424 1
## 10 OO COS 1335 1
## 11 UA SFO 643 1
## 12 US CLT 2212 1
## 13 WN DAL 8243 1
## 14 XE CRP 3175 1
## 15 YV CLT 71 1
# Use summarise to calculate n_carrier
# (loading data.table masks a few dplyr/purrr names, as the messages show)
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, last
## The following object is masked from 'package:purrr':
##
## transpose
# dplyr verbs also work on a data.table; note the data.table-style printing
hflights2 <- as.data.table(hflights)
hflights2 %>%
summarize(n_carrier = n_distinct(UniqueCarrier))
## n_carrier
## 1: 15
And, dplyr can also be used with databases: it translates each request into the matching SQL query and runs it on the server. The results are cached to avoid constantly pinging the server:
# dplyr against a remote database: verbs are translated to SQL and evaluated
# lazily on the server; results are fetched (and cached) on demand.
# NOTE(review): credentials are hard-coded here because this is a public
# DataCamp training database; never embed real credentials like this.
# Set up a connection to the mysql database
my_db <- src_mysql(dbname = "dplyr",
host = "courses.csrrinzqubik.us-east-1.rds.amazonaws.com",
port = 3306,
user = "student",
password = "datacamp")
# Reference a table within that source: nycflights
nycflights <- tbl(my_db, "dplyr")
# glimpse at nycflights
glimpse(nycflights)
## Observations: 336,776
## Variables: 17
## $ id (int) 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1...
## $ year (int) 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013...
## $ month (int) 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ day (int) 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ dep_time (int) 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 55...
## $ dep_delay (int) 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2,...
## $ arr_time (int) 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 8...
## $ arr_delay (int) 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7,...
## $ carrier (chr) "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6"...
## $ tailnum (chr) "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N...
## $ flight (int) 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301...
## $ origin (chr) "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LG...
## $ dest (chr) "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IA...
## $ air_time (int) 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149...
## $ distance (int) 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 73...
## $ hour (int) 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6...
## $ minute (int) 17, 33, 42, 44, 54, 54, 55, 57, 57, 58, 58, 58, 58, ...
# Ordered, grouped summary of nycflights (executed server-side as SQL)
nycflights %>%
group_by(carrier) %>%
summarize(n_flights = n(), avg_delay = mean(arr_delay)) %>%
arrange(avg_delay)
## Source: mysql 5.6.23-log [student@courses.csrrinzqubik.us-east-1.rds.amazonaws.com:/dplyr]
## From: <derived table> [?? x 3]
## Arrange: avg_delay
## Warning in .local(conn, statement, ...): Decimal MySQL column 2 imported as
## numeric
## carrier n_flights avg_delay
## (chr) (dbl) (dbl)
## 1 AS 714 -9.8613
## 2 HA 342 -6.9152
## 3 AA 32729 0.3556
## 4 DL 48110 1.6289
## 5 VX 5162 1.7487
## 6 US 20536 2.0565
## 7 UA 58665 3.5045
## 8 9E 18460 6.9135
## 9 B6 54635 9.3565
## 10 WN 12275 9.4675
## .. ... ... ...
The data.table library is designed to simplify and speed up work with large datasets. The language is broadly analogous to SQL, with syntax that includes equivalents for SELECT, WHERE, and GROUP BY. Some general attributes of a data.table object include:
NOTE - every data.table is also a data.frame; if a package is not aware of data.table, the object will simply behave as a data.frame for that package.
General syntax is:
Example table creation:
Some example code includes:
# data.table basics: construction, row indexing with .N, and row duplication.
library(data.table)
# a = c(1, 2) is silently recycled to length 4 to match b = LETTERS[1:4],
# giving the repeating 1,2,1,2 pattern visible in str() below.
DT <- data.table(a = c(1, 2), b=LETTERS[1:4])
str(DT)
## Classes 'data.table' and 'data.frame': 4 obs. of 2 variables:
## $ a: num 1 2 1 2
## $ b: chr "A" "B" "C" "D"
## - attr(*, ".internal.selfref")=<externalptr>
DT
## a b
## 1: 1 A
## 2: 2 B
## 3: 1 C
## 4: 2 D
# Print the second to last row of DT using .N (.N is the number of rows)
DT[.N-1]
## a b
## 1: 1 C
# Print the column names of DT
names(DT)
## [1] "a" "b"
# Print the number or rows and columns of DT
dim(DT)
## [1] 4 2
# Select row 2 twice and row 3, returning a data.table with three rows where row 2 is a duplicate of row 1.
DT[c(2, 2:3)]
## a b
## 1: 2 B
## 2: 2 B
## 3: 1 C
# data.table j-expressions: .(...) is shorthand for list(...), so j can
# select columns and compute new ones in a single expression.
DT <- data.table(A = 1:5, B = letters[1:5], C = 6:10)
str(DT)
## Classes 'data.table' and 'data.frame': 5 obs. of 3 variables:
## $ A: int 1 2 3 4 5
## $ B: chr "a" "b" "c" "d" ...
## $ C: int 6 7 8 9 10
## - attr(*, ".internal.selfref")=<externalptr>
DT
## A B C
## 1: 1 a 6
## 2: 2 b 7
## 3: 3 c 8
## 4: 4 d 9
## 5: 5 e 10
# Subset rows 1 and 3, and columns B and C
DT[c(1, 3), .(B, C)]
## B C
## 1: a 6
## 2: c 8
# Assign to ans the correct value (a computed column alongside B)
ans <- DT[ , .(B, val=A*C)]
ans
## B val
## 1: a 6
## 2: b 14
## 3: c 24
## 4: d 36
## 5: e 50
# Fill in the blanks such that ans2 equals target
# (B recycles to match the length-10 vector c(C, A))
target <- data.table(B = c("a", "b", "c", "d", "e", "a", "b", "c", "d", "e"),
val = as.integer(c(6:10, 1:5))
)
ans2 <- DT[, .(B, val = c(C, A))]
identical(target, ans2)
## [1] TRUE
# Grouping with the third argument (by): the grouping expression can be a
# plain column, a computed expression, or a named list.
DT <- as.data.table(iris)
str(DT)
## Classes 'data.table' and 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, ".internal.selfref")=<externalptr>
# For each Species, print the mean Sepal.Length
# (an unnamed j-result gets the default column name V1)
DT[ , mean(Sepal.Length), Species]
## Species V1
## 1: setosa 5.006
## 2: versicolor 5.936
## 3: virginica 6.588
# Print mean Sepal.Length, grouping by first letter of Species
DT[ , mean(Sepal.Length), substr(Species, 1, 1)]
## substr V1
## 1: s 5.006
## 2: v 6.262
# Reading with [ did not modify DT in place:
str(DT)
## Classes 'data.table' and 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, ".internal.selfref")=<externalptr>
identical(DT, as.data.table(iris))
## [1] TRUE
# Group the specimens by Sepal area (to the nearest 10 cm2) and count how many occur in each group.
DT[, .N, by = 10 * round(Sepal.Length * Sepal.Width / 10)]
## round N
## 1: 20 117
## 2: 10 29
## 3: 30 4
# Now name the output columns `Area` and `Count`
DT[, .(Count=.N), by = .(Area = 10 * round(Sepal.Length * Sepal.Width / 10))]
## Area Count
## 1: 20 117
## 2: 10 29
## 3: 30 4
# When j returns more than one value per group (e.g. cumsum), the result has
# one row per input row, with the group columns repeated.
# Create the data.table DT
set.seed(1L)
DT <- data.table(A = rep(letters[2:1], each = 4L),
B = rep(1:4, each = 2L),
C = sample(8)
)
str(DT)
## Classes 'data.table' and 'data.frame': 8 obs. of 3 variables:
## $ A: chr "b" "b" "b" "b" ...
## $ B: int 1 1 2 2 3 3 4 4
## $ C: int 3 8 4 5 1 7 2 6
## - attr(*, ".internal.selfref")=<externalptr>
DT
## A B C
## 1: b 1 3
## 2: b 1 8
## 3: b 2 4
## 4: b 2 5
## 5: a 3 1
## 6: a 3 7
## 7: a 4 2
## 8: a 4 6
# Create the new data.table, DT2: running sum of C within each (A, B) group
DT2 <- DT[, .(C = cumsum(C)), by = .(A, B)]
str(DT2)
## Classes 'data.table' and 'data.frame': 8 obs. of 3 variables:
## $ A: chr "b" "b" "b" "b" ...
## $ B: int 1 1 2 2 3 3 4 4
## $ C: int 3 11 4 9 1 8 2 8
## - attr(*, ".internal.selfref")=<externalptr>
DT2
## A B C
## 1: b 1 3
## 2: b 1 11
## 3: b 2 4
## 4: b 2 9
## 5: a 3 1
## 6: a 3 8
## 7: a 4 2
## 8: a 4 8
# Select from DT2 the last two values from C while you group by A
DT2[, .(C = tail(C, 2)), by = A]
## A C
## 1: b 4
## 2: b 9
## 3: a 2
## 4: a 8
The chaining operation in data.table is run as [statement][next statement].
Example code includes:
# Chaining: [first query][second query] applies the second query to the
# result of the first, avoiding the DT2 intermediate from the prior example.
set.seed(1L)
DT <- data.table(A = rep(letters[2:1], each = 4L),
B = rep(1:4, each = 2L),
C = sample(8))
str(DT)
## Classes 'data.table' and 'data.frame': 8 obs. of 3 variables:
## $ A: chr "b" "b" "b" "b" ...
## $ B: int 1 1 2 2 3 3 4 4
## $ C: int 3 8 4 5 1 7 2 6
## - attr(*, ".internal.selfref")=<externalptr>
DT
## A B C
## 1: b 1 3
## 2: b 1 8
## 3: b 2 4
## 4: b 2 5
## 5: a 3 1
## 6: a 3 7
## 7: a 4 2
## 8: a 4 6
# Perform operation using chaining
DT[ , .(C = cumsum(C)), by = .(A, B)][ , .(C = tail(C, 2)), by=.(A)]
## A C
## 1: b 4
## 2: b 9
## 3: a 2
## 4: a 8
# Chained per-group medians, then .SD examples. .SD holds the Subset of Data
# for each group; .SDcols restricts which columns it contains.
data(iris)
DT <- as.data.table(iris)
str(DT)
## Classes 'data.table' and 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, ".internal.selfref")=<externalptr>
# Perform chained operations on DT: per-species medians, then reverse the
# Species order with order(-Species) in the chained second query
DT[ , .(Sepal.Length = median(Sepal.Length), Sepal.Width = median(Sepal.Width),
Petal.Length = median(Petal.Length), Petal.Width = median(Petal.Width)),
by=.(Species)][order(-Species)]
## Species Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1: virginica 6.5 3.0 5.55 2.0
## 2: versicolor 5.9 2.8 4.35 1.3
## 3: setosa 5.0 3.4 1.50 0.2
# The examples below are kept as reference snippets (not run in this session).
# Mean of columns
# DT[ , lapply(.SD, FUN=mean), by=.(x)]
# Median of columns
# DT[ , lapply(.SD, FUN=median), by=.(x)]
# Calculate the sum of the Q columns
# DT[ , lapply(.SD, FUN=sum), , .SDcols=2:4]
# Calculate the sum of columns H1 and H2
# DT[ , lapply(.SD, FUN=sum), , .SDcols=paste0("H", 1:2)]
# Select all but the first row of groups 1 and 2, returning only the grp column and the Q columns
# foo = function(x) { x[-1] }
# DT[ , lapply(.SD, FUN=foo), by=.(grp), .SDcols=paste0("Q", 1:3)]
# Sum of all columns and the number of rows
# DT[, c(lapply(.SD, FUN=sum), .N), by=.(x), .SDcols=names(DT)]
# Cumulative sum of column x and y while grouping by x and z > 8
# DT[, lapply(.SD, FUN=cumsum), by=.(by1=x, by2=(z>8)), .SDcols=c("x", "y")]
# Chaining
# DT[, lapply(.SD, FUN=cumsum), by=.(by1=x, by2=(z>8)), .SDcols=c("x", "y")][ , lapply(.SD, FUN=max), by=.(by1), .SDcols=c("x", "y")]
# := updates a data.table by reference (no copy). Combined with i and by it
# supports sub-assignment, grouped assignment, and column deletion.
# The data.table DT
DT <- data.table(A = letters[c(1, 1, 1, 2, 2)], B = 1:5)
str(DT)
## Classes 'data.table' and 'data.frame': 5 obs. of 2 variables:
## $ A: chr "a" "a" "a" "b" ...
## $ B: int 1 2 3 4 5
## - attr(*, ".internal.selfref")=<externalptr>
DT
## A B
## 1: a 1
## 2: a 2
## 3: a 3
## 4: b 4
## 5: b 5
# Add column by reference: Total (the per-group sum is recycled across rows)
DT[ , Total:=sum(B), by=.(A)]
DT
## A B Total
## 1: a 1 6
## 2: a 2 6
## 3: a 3 6
## 4: b 4 9
## 5: b 5 9
# Add 1 to column B, but only in rows 2 and 4 (sub-assignment by reference)
DT[c(2,4) , B:=B+1L, ]
DT
## A B Total
## 1: a 1 6
## 2: a 3 6
## 3: a 3 6
## 4: b 5 9
## 5: b 5 9
# Add a new column Total2, computed only for rows 2:4; untouched rows get NA
DT[2:4, Total2:=sum(B), by=.(A)]
DT
## A B Total Total2
## 1: a 1 6 NA
## 2: a 3 6 6
## 3: a 3 6 6
## 4: b 5 9 5
## 5: b 5 9 NA
# Remove the Total column (assigning NULL deletes by reference)
DT[ , Total := NULL, ]
DT
## A B Total2
## 1: a 1 NA
## 2: a 3 6
## 3: a 3 6
## 4: b 5 5
## 5: b 5 NA
# Select the third column using `[[` (returns a plain vector)
DT[[3]]
## [1] NA 6 6 5 NA
# Multiple columns can be updated/added in one := call by giving a character
# vector of names on the left and a list of values on the right.
# A data.table DT has been created for you
DT <- data.table(A = c(1, 1, 1, 2, 2), B = 1:5)
str(DT)
## Classes 'data.table' and 'data.frame': 5 obs. of 2 variables:
## $ A: num 1 1 1 2 2
## $ B: int 1 2 3 4 5
## - attr(*, ".internal.selfref")=<externalptr>
DT
## A B
## 1: 1 1
## 2: 1 2
## 3: 1 3
## 4: 2 4
## 5: 2 5
# Update B, add C and D (D's scalar 2 is recycled to all rows)
DT[ , c("B", "C", "D") := .(B + 1, A + B, 2), ]
DT
## A B C D
## 1: 1 2 2 2
## 2: 1 3 3 2
## 3: 1 4 4 2
## 4: 2 5 6 2
## 5: 2 6 7 2
# Delete my_cols — wrap the variable in () so it is evaluated as a vector of
# column names rather than treated as a column called "my_cols"
my_cols <- c("B", "C")
DT[ , (my_cols) := NULL, ]
DT
## A D
## 1: 1 2
## 2: 1 2
## 3: 1 2
## 4: 2 2
## 5: 2 2
# Delete column 2 by number
DT[[2]] <- NULL
DT
## A
## 1: 1
## 2: 1
## 3: 1
## 4: 2
## 5: 2
# The snippets below are kept as reference (not run in this session); set()
# is a loopable, low-overhead version of :=.
# Set the seed
# set.seed(1)
# Check the DT that is made available to you
# DT
# For loop with set
# for(i in 2:4) { set(DT, sample(nrow(DT), 3), i, NA) }
# Change the column names to lowercase
# setnames(DT, letters[1:4])
# Print the resulting DT to the console
# DT
# setnames() and setcolorder() modify names/column order by reference,
# without copying the table.
# Define DT (b = 1 is recycled to all five rows)
DT <- data.table(a = letters[c(1, 1, 1, 2, 2)], b = 1)
str(DT)
## Classes 'data.table' and 'data.frame': 5 obs. of 2 variables:
## $ a: chr "a" "a" "a" "b" ...
## $ b: num 1 1 1 1 1
## - attr(*, ".internal.selfref")=<externalptr>
DT
## a b
## 1: a 1
## 2: a 1
## 3: a 1
## 4: b 1
## 5: b 1
# Add a suffix "_2" to all column names (single-argument form renames all)
setnames(DT, paste0(names(DT), "_2"))
DT
## a_2 b_2
## 1: a 1
## 2: a 1
## 3: a 1
## 4: b 1
## 5: b 1
# Change column name "a_2" to "A2" (old-name/new-name form renames one)
setnames(DT, "a_2", "A2")
DT
## A2 b_2
## 1: a 1
## 2: a 1
## 3: a 1
## 4: b 1
## 5: b 1
# Reverse the order of the columns
setcolorder(DT, 2:1)
DT
## b_2 A2
## 1: 1 a
## 2: 1 a
## 3: 1 a
## 4: 1 b
## 5: 1 b
Example code includes:
# Clean up iris as a data.table, then filter rows with an expression in i.
# iris as a data.table (note: this shadows the built-in iris dataset)
iris <- as.data.table(iris)
# Remove the "Sepal." prefix — the dot is escaped so gsub matches it literally
names(iris) <- gsub("Sepal\\.", "", names(iris))
# Remove the two columns starting with "Petal"
iris[, c("Petal.Length", "Petal.Width") := NULL, ]
# Cleaned up iris data.table
str(iris)
## Classes 'data.table' and 'data.frame': 150 obs. of 3 variables:
## $ Length : num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Species: Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, ".internal.selfref")=<externalptr>
# Area is greater than 20 square centimeters (i can be any row-filtering
# expression over the columns)
iris[ Width * Length > 20 ]
## Length Width Species
## 1: 5.4 3.9 setosa
## 2: 5.8 4.0 setosa
## 3: 5.7 4.4 setosa
## 4: 5.4 3.9 setosa
## 5: 5.7 3.8 setosa
## 6: 5.2 4.1 setosa
## 7: 5.5 4.2 setosa
## 8: 7.0 3.2 versicolor
## 9: 6.4 3.2 versicolor
## 10: 6.9 3.1 versicolor
## 11: 6.3 3.3 versicolor
## 12: 6.7 3.1 versicolor
## 13: 6.7 3.0 versicolor
## 14: 6.0 3.4 versicolor
## 15: 6.7 3.1 versicolor
## 16: 6.3 3.3 virginica
## 17: 7.1 3.0 virginica
## 18: 7.6 3.0 virginica
## 19: 7.3 2.9 virginica
## 20: 7.2 3.6 virginica
## 21: 6.5 3.2 virginica
## 22: 6.8 3.0 virginica
## 23: 6.4 3.2 virginica
## 24: 7.7 3.8 virginica
## 25: 7.7 2.6 virginica
## 26: 6.9 3.2 virginica
## 27: 7.7 2.8 virginica
## 28: 6.7 3.3 virginica
## 29: 7.2 3.2 virginica
## 30: 7.2 3.0 virginica
## 31: 7.4 2.8 virginica
## 32: 7.9 3.8 virginica
## 33: 7.7 3.0 virginica
## 34: 6.3 3.4 virginica
## 35: 6.9 3.1 virginica
## 36: 6.7 3.1 virginica
## 37: 6.9 3.1 virginica
## 38: 6.8 3.2 virginica
## 39: 6.7 3.3 virginica
## 40: 6.7 3.0 virginica
## 41: 6.2 3.4 virginica
## Length Width Species
# Add new boolean column
iris[, is_large := Width * Length > 25]
## Warning in `[.data.table`(iris, , `:=`(is_large, Width * Length > 25)):
## Invalid .internal.selfref detected and fixed by taking a (shallow) copy
## of the data.table so that := can add this new column by reference. At
## an earlier point, this data.table has been copied by R (or been created
## manually using structure() or similar). Avoid key<-, names<- and attr<-
## which in R currently (and oddly) may copy the whole data.table. Use set*
## syntax instead to avoid copying: ?set, ?setnames and ?setattr. Also, in
## R<=v3.0.2, list(DT1,DT2) copied the entire DT1 and DT2 (R's list() used to
## copy named objects); please upgrade to R>v3.0.2 if that is biting. If this
## message doesn't help, please report to datatable-help so the root cause can
## be fixed.
# Now large observations with is_large
iris[is_large == TRUE]
## Length Width Species is_large
## 1: 5.7 4.4 setosa TRUE
## 2: 7.2 3.6 virginica TRUE
## 3: 7.7 3.8 virginica TRUE
## 4: 7.9 3.8 virginica TRUE
iris[(is_large)] # Also OK
## Length Width Species is_large
## 1: 5.7 4.4 setosa TRUE
## 2: 7.2 3.6 virginica TRUE
## 3: 7.7 3.8 virginica TRUE
## 4: 7.9 3.8 virginica TRUE
# The 'keyed' data.table DT
DT <- data.table(A = letters[c(2, 1, 2, 3, 1, 2, 3)],
B = c(5, 4, 1, 9, 8, 8, 6),
C = 6:12)
setkey(DT, A, B)
str(DT)
## Classes 'data.table' and 'data.frame': 7 obs. of 3 variables:
## $ A: chr "a" "a" "b" "b" ...
## $ B: num 4 8 1 5 8 6 9
## $ C: int 7 10 8 6 11 12 9
## - attr(*, ".internal.selfref")=<externalptr>
## - attr(*, "sorted")= chr "A" "B"
DT
## A B C
## 1: a 4 7
## 2: a 8 10
## 3: b 1 8
## 4: b 5 6
## 5: b 8 11
## 6: c 6 12
## 7: c 9 9
# Select the "b" group
DT["b"]
## A B C
## 1: b 1 8
## 2: b 5 6
## 3: b 8 11
# "b" and "c" groups
DT[c("b", "c")]
## A B C
## 1: b 1 8
## 2: b 5 6
## 3: b 8 11
## 4: c 6 12
## 5: c 9 9
# The first row of the "b" and "c" groups
DT[c("b", "c"), mult = "first"]
## A B C
## 1: b 1 8
## 2: c 6 12
# First and last row of the "b" and "c" groups
DT[c("b", "c"), .SD[c(1, .N)], by = .EACHI]
## A B C
## 1: b 1 8
## 2: b 8 11
## 3: c 6 12
## 4: c 9 9
# Copy and extend code for instruction 4: add printout
DT[c("b", "c"), { print(.SD); .SD[c(1, .N)] }, by = .EACHI]
## B C
## 1: 1 8
## 2: 5 6
## 3: 8 11
## B C
## 1: 6 12
## 2: 9 9
## A B C
## 1: b 1 8
## 2: b 8 11
## 3: c 6 12
## 4: c 9 9
# Keyed data.table DT
DT <- data.table(A = letters[c(2, 1, 2, 3, 1, 2, 3)],
B = c(5, 4, 1, 9, 8, 8, 6),
C = 6:12,
key = "A,B")
str(DT)
## Classes 'data.table' and 'data.frame': 7 obs. of 3 variables:
## $ A: chr "a" "a" "b" "b" ...
## $ B: num 4 8 1 5 8 6 9
## $ C: int 7 10 8 6 11 12 9
## - attr(*, "sorted")= chr "A" "B"
## - attr(*, ".internal.selfref")=<externalptr>
DT
## A B C
## 1: a 4 7
## 2: a 8 10
## 3: b 1 8
## 4: b 5 6
## 5: b 8 11
## 6: c 6 12
## 7: c 9 9
# Get the key of DT
key(DT)
## [1] "A" "B"
# Row where A == "b" and B == 6
DT[.("b", 6)]
## A B C
## 1: b 6 NA
# Return the prevailing row
DT[.("b", 6), roll=TRUE]
## A B C
## 1: b 6 6
# Return the nearest row
DT[.("b", 6), roll="nearest"]
## A B C
## 1: b 6 6
# Keyed data.table DT
DT <- data.table(A = letters[c(2, 1, 2, 3, 1, 2, 3)],
B = c(5, 4, 1, 9, 8, 8, 6),
C = 6:12,
key = "A,B")
str(DT)
## Classes 'data.table' and 'data.frame': 7 obs. of 3 variables:
## $ A: chr "a" "a" "b" "b" ...
## $ B: num 4 8 1 5 8 6 9
## $ C: int 7 10 8 6 11 12 9
## - attr(*, "sorted")= chr "A" "B"
## - attr(*, ".internal.selfref")=<externalptr>
DT
## A B C
## 1: a 4 7
## 2: a 8 10
## 3: b 1 8
## 4: b 5 6
## 5: b 8 11
## 6: c 6 12
## 7: c 9 9
# Print the sequence (-2):10 for the "b" group
DT[.("b", (-2):10)]
## A B C
## 1: b -2 NA
## 2: b -1 NA
## 3: b 0 NA
## 4: b 1 8
## 5: b 2 NA
## 6: b 3 NA
## 7: b 4 NA
## 8: b 5 6
## 9: b 6 NA
## 10: b 7 NA
## 11: b 8 11
## 12: b 9 NA
## 13: b 10 NA
# Add code: carry the prevailing values forwards
DT[.("b", (-2):10), roll=TRUE]
## A B C
## 1: b -2 NA
## 2: b -1 NA
## 3: b 0 NA
## 4: b 1 8
## 5: b 2 8
## 6: b 3 8
## 7: b 4 8
## 8: b 5 6
## 9: b 6 6
## 10: b 7 6
## 11: b 8 11
## 12: b 9 11
## 13: b 10 11
# Add code: carry the first observation backwards
DT[.("b", (-2):10), roll=TRUE, rollends=TRUE]
## A B C
## 1: b -2 8
## 2: b -1 8
## 3: b 0 8
## 4: b 1 8
## 5: b 2 8
## 6: b 3 8
## 7: b 4 8
## 8: b 5 6
## 9: b 6 6
## 10: b 7 6
## 11: b 8 11
## 12: b 9 11
## 13: b 10 11
Jeff Ryan, the creator of quantmod and organizer of the R/Finance conference, has developed xts and zoo to simplify working with time series data. The course will cover five areas (chapters):
“xts” stands for extensible time series. The core of each “xts” is a “zoo” object, consisting of a matrix plus an index.
There are a few special behaviors of xts:
The “xts” object can be de-constructed when needed:
Data usually already exists and needs to be "wrangled" into a proper format for xts/zoo. The easiest way to convert is using as.xts(). You can coerce truly external data after loading it, and can also save data with write.zoo(x, "file").
Subsetting based on time is a particular strength of xts. xts supports ISO8601:2004 (the standard, “right way”, to unambiguously consider times):
xts allows for four methods of specifying dates or intervals:
Can also use some traditional R-like methods (since xts extends zoo, and zoo extends base R):
Can set the flag which.i = TRUE to get back the correct records (row numbers). For example, index <- x[“2007-06-26/2007-06-28”, which.i = TRUE].
Description of key behaviors when working with an xts object:
xts introduces a few relatives of the head() and tail() functionality. These are the first() and last() functions.
Math operations using xts - xts is a matrix - need to be careful about matrix operations. Math operations are run only on the intersection of items:
Merging time series is common. Merging with cbind()/merge() combines by columns, joining based on the index.
Merging with rbind() combines by rows, though all rows must already have an index. Basically, rbind() must be used on time series objects.
Missing data is common, and xts inherits all of the zoo methods for dealing with missing data. LOCF is "last observation carried forward" (the latest value that is not NA) - called with na.locf:
The NA can be managed in several ways:
Lag operators and difference operations. Seasonality is a repeating pattern. There is often a need to compare seasonality – for example, compare Mondays. Stationarity refers to some bound of the series.
The lag() function will change the timestamp, so that (for example) today can be merged as last week:
The “one period lag first difference” is calculated as diff(x, lag=1, differences=1, arithmetic=TRUE, log=FALSE, na.pad=TRUE, . ).
There are two main approaches for applying functions on discrete periods or intervals:
Time series aggregation can also be handled by xts:
Time series data can also be managed in a “rolling” manner - discrete or continuous:
Internals of xts such as indices and timezones:
Final topics:
Example code includes:
library(xts)
## Warning: package 'xts' was built under R version 3.2.5
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'xts'
## The following object is masked from 'package:data.table':
##
## last
## The following objects are masked from 'package:dplyr':
##
## first, last
library(zoo)
# An xts object is built from a data matrix plus a time index
x <- matrix(data=1:4, ncol=2)
idx <- as.Date(c("2015-01-01", "2015-02-01"))
# Create the xts
X <- xts(x, order.by = idx)
# Deconstruct the xts: coredata() returns the bare data matrix
coredata(X, fmt=FALSE)
## [,1] [,2]
## [1,] 1 3
## [2,] 2 4
# index() returns the time index
index(X)
## [1] "2015-01-01" "2015-02-01"
# Working with the sunspots data
data(sunspots)
class(sunspots)
## [1] "ts"
sunspots_xts <- as.xts(sunspots)
class(sunspots_xts)
## [1] "xts" "zoo"
head(sunspots_xts)
## [,1]
## Jan 1749 58.0
## Feb 1749 62.6
## Mar 1749 70.0
## Apr 1749 55.7
## May 1749 85.0
## Jun 1749 83.5
# Example from chapter #1
ex_matrix <- xts(matrix(data=c(1, 1, 1, 2, 2, 2), ncol=2),
order.by=as.Date(c("2016-06-01", "2016-06-02", "2016-06-03"))
)
core <- coredata(ex_matrix)
# View the structure of ex_matrix
str(ex_matrix)
## An 'xts' object on 2016-06-01/2016-06-03 containing:
## Data: num [1:3, 1:2] 1 1 1 2 2 2
## Indexed by objects of class: [Date] TZ: UTC
## xts Attributes:
## NULL
# Extract the 3rd observation of the 2nd column of ex_matrix
ex_matrix[3, 2]
## [,1]
## 2016-06-03 2
# Extract the 3rd observation of the 2nd column of core
core[3, 2]
## [1] 2
# Create the object data using 5 random numbers
data <- rnorm(5)
# Create dates as a Date class object starting from 2016-01-01
dates <- seq(as.Date("2016-01-01"), length = 5, by = "days")
# Use xts() to create smith
smith <- xts(x = data, order.by = dates)
# Create bday (1899-05-08) using a POSIXct date class object
bday <- as.POSIXct("1899-05-08")
# Create hayek and add a new attribute called born
# (extra named arguments to xts() are stored as user-defined attributes)
hayek <- xts(x = data, order.by = dates, born = bday)
# Extract the core data of hayek
# coredata() strips the index, leaving the underlying matrix
hayek_core <- coredata(hayek)
# View the class of hayek_core
class(hayek_core)
## [1] "matrix"
# Extract the index of hayek
# index() returns the time index with its original class preserved
hayek_index <- index(hayek)
# View the class of hayek_index
class(hayek_index)
## [1] "Date"
# Create dates
dates <- as.Date("2016-01-01") + 0:4
# Create ts_a, indexed by the Date vector
ts_a <- xts(x = 1:5, order.by = dates)
# Create ts_b, indexed by the same dates coerced to POSIXct date-times
ts_b <- xts(x = 1:5, order.by = as.POSIXct(dates))
# Extract the rows of ts_a using the index of ts_b
# (all five rows match, as shown below)
ts_a[index(ts_b)]
## [,1]
## 2016-01-01 1
## 2016-01-02 2
## 2016-01-03 3
## 2016-01-04 4
## 2016-01-05 5
# Extract the rows of ts_b using the index of ts_a
# NOTE(review): only the header prints below — matching a POSIXct-indexed
# series with a Date index apparently returns no rows here (looks like a
# timezone/coercion gotcha; confirm against the xts docs)
ts_b[index(ts_a)]
## [,1]
data(austres)
# Convert austres to an xts object called au
au <- as.xts(austres)
# Convert your xts object (au) into a matrix am
am <- as.matrix(au)
# Convert the original austres into a matrix am2
am2 <- as.matrix(austres)
# Create dat by reading tmp_file
tmp_file <- "http://s3.amazonaws.com/assets.datacamp.com/production/course_1127/datasets/tmp_file.csv"
dat <- read.csv(tmp_file)
# Convert dat into xts
xts(dat, order.by = as.Date(rownames(dat), "%m/%d/%Y"))
## a b
## 2015-01-02 1 3
## 2015-02-03 2 4
# Read tmp_file using read.zoo
dat_zoo <- read.zoo(tmp_file, index.column = 0, sep = ",", format = "%m/%d/%Y")
# Convert dat_zoo to xts
dat_xts <- as.xts(dat_zoo)
# Convert sunspots to xts using as.xts(). Save this as sunspots_xts
sunspots_xts <- as.xts(sunspots)
# Get the temporary file name
tmp <- tempfile()
# Write the xts object using zoo to tmp
write.zoo(sunspots_xts, sep = ",", file = tmp)
# Read the tmp file. FUN = as.yearmon converts strings such as Jan 1749 into a proper time class
sun <- read.zoo(tmp, sep = ",", FUN = as.yearmon)
# Convert sun into xts. Save this as sun_xts
sun_xts <- as.xts(sun)
data(edhec, package="PerformanceAnalytics")
head(edhec["2007-01", 1])
## Convertible Arbitrage
## 2007-01-31 0.013
head(edhec["2007-01/2007-03", 1])
## Convertible Arbitrage
## 2007-01-31 0.0130
## 2007-02-28 0.0117
## 2007-03-31 0.0060
head(edhec["200701/03", 1])
## Convertible Arbitrage
## 2007-01-31 0.0130
## 2007-02-28 0.0117
## 2007-03-31 0.0060
first(edhec[, "Funds of Funds"], "4 months")
## Funds of Funds
## 1997-01-31 0.0317
## 1997-02-28 0.0106
## 1997-03-31 -0.0077
## 1997-04-30 0.0009
last(edhec[, "Funds of Funds"], "1 year")
## Funds of Funds
## 2009-01-31 0.0060
## 2009-02-28 -0.0037
## 2009-03-31 0.0008
## 2009-04-30 0.0092
## 2009-05-31 0.0312
## 2009-06-30 0.0024
## 2009-07-31 0.0153
## 2009-08-31 0.0113
Graphical tools help to visualize a dataset:
There are four graphics systems in R:
Preview of more and less useful techniques:
Quality of graph types - In general, pie charts are not as good as bar charts.
Example code includes:
# Load MASS package
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
# Plot whiteside data
plot(whiteside)
# Plot Gas vs. Temp
plot(y=whiteside$Gas, x=whiteside$Temp, xlab="Outside temperature", ylab="Heating gas consumption")
# Apply the plot() function to Insul
plot(whiteside$Insul)
# Plot Max.Price vs. Price as red triangles
plot(x=Cars93$Price, y=Cars93$Max.Price, pch=17, col="red")
# Add Min.Price vs. Price as blue circles
points(x=Cars93$Price, y=Cars93$Min.Price, pch=16, col="blue")
# Add an equality reference line with abline()
abline(a = 0, b = 1, lty = 2)
# Load the robustbase package
library(robustbase)
## Warning: package 'robustbase' was built under R version 3.2.5
# Set up the side-by-side plot array
par(mfrow = c(1, 2))
# First plot: brain vs. body in its original form
plot(x=Animals2$body, y=Animals2$brain)
# Add the first title
title("Original representation")
# Second plot: log-log plot of brain vs. body
# First plot: brain vs. body in its original form
plot(x=Animals2$body, y=Animals2$brain, log="xy")
# Add the second title
title("Log-log plot")
par(mfrow = c(1, 1))
# Load the insuranceData package
library(insuranceData)
## Warning: package 'insuranceData' was built under R version 3.2.5
# Use the data() function to get the dataCar data frame
data(dataCar)
# Set up a side-by-side plot array
par(mfrow=c(1, 2))
# Create a table of veh_body record counts and sort
tbl <- sort(table(dataCar$veh_body),
decreasing = TRUE)
# Create the pie chart and give it a title
pie(tbl)
title("Pie chart")
# Create the barplot with perpendicular, half-sized labels
barplot(tbl, las = 2, cex.names = 0.5)
# Add a title
title("Bar chart")
par(mfrow = c(1, 1))
# Unload the MASS package so that dplyr works!
detach(package:MASS)
Characterizing a single variable
Relationships between two variables
Complex relationships between variables
Example code includes:
data(Cars93, package="MASS")
# Set up a side-by-side plot array
par(mfrow=c(1, 2))
# Create a histogram of counts with hist()
hist(Cars93$Horsepower, main="hist() plot")
# Create a normalized histogram with truehist()
MASS::truehist(Cars93$Horsepower, main="truehist() plot")
par(mfrow=c(1, 1))
# Create index16, pointing to 16-week chicks
index16 <- which(ChickWeight$Time == 16)
# Get the 16-week chick weights
weights <- ChickWeight$weight[index16]
# Plot the normalized histogram
MASS::truehist(weights)
# Add the density curve to the histogram
lines(density(weights))
# Create index16, pointing to 16-week chicks
index16 <- which(ChickWeight$Time == 16)
# Get the 16-week chick weights
weights <- ChickWeight$weight[index16]
# Show the normal QQ-plot of the chick weights
car::qqPlot(weights)
# Show the normal QQ-plot of the Boston$tax data
data(Boston, package="MASS")
car::qqPlot(Boston$tax)
# Set up a side-by-side plot array
par(mfrow=c(1, 2))
# Create the standard scatterplot
plot(rad ~ zn, data=Boston)
# Add the title
title("Standard scatterplot")
# Create the sunflowerplot
sunflowerplot(rad ~ zn, data=Boston)
# Add the title
title("Sunflower plot")
par(mfrow=c(1, 1))
# Create a variable-width boxplot with log y-axis & horizontal labels
boxplot(crim ~ rad, data=Boston, varwidth=TRUE, log="y", las=1)
# Add a title
title("Crime rate vs. radial highway index")
# Create a mosaic plot using the formula interface
mosaicplot(carb ~ cyl, data=mtcars)
# Create a side-by-side boxplot summary
boxplot(Cars93$Min.Price, Cars93$Max.Price)
# Create a bagplot for the same two variables
aplpack::bagplot(Cars93$Min.Price, Cars93$Max.Price, cex=1.2)
# Add an equality reference line
abline(a=0, b=1, lty=2)
# Extract the numerical variables from UScereal
data(UScereal, package="MASS")
numericalVars <- UScereal[, 2:10]
# Compute the correlation matrix for these variables
corrMat <- cor(numericalVars)
# Generate the correlation ellipse plot
corrplot::corrplot(corrMat, method="ellipse")
# Fit an rpart model to predict medv from all other Boston variables
tree_model <- rpart::rpart(medv ~ ., data=Boston)
# Plot the structure of this decision tree model
plot(tree_model)
# Add labels to this plot
text(tree_model, cex=0.7)
Options can be specified either globally (e.g., par(mfrow)) or locally (e.g., cex) or both (cex.main can be global or local):
Adding lines and points to plots can help highlight key details (e.g,, outliers in a different color and shape):
Adding text to plots helps to explain the findings for the reader:
Adding or modifying other plot details:
Example code includes:
# Assign the return value from the par() function to plot_pars
plot_pars <- par()
# Display the names of the par() function's list elements
names(plot_pars)
## [1] "xlog" "ylog" "adj" "ann" "ask"
## [6] "bg" "bty" "cex" "cex.axis" "cex.lab"
## [11] "cex.main" "cex.sub" "cin" "col" "col.axis"
## [16] "col.lab" "col.main" "col.sub" "cra" "crt"
## [21] "csi" "cxy" "din" "err" "family"
## [26] "fg" "fig" "fin" "font" "font.axis"
## [31] "font.lab" "font.main" "font.sub" "lab" "las"
## [36] "lend" "lheight" "ljoin" "lmitre" "lty"
## [41] "lwd" "mai" "mar" "mex" "mfcol"
## [46] "mfg" "mfrow" "mgp" "mkh" "new"
## [51] "oma" "omd" "omi" "page" "pch"
## [56] "pin" "plt" "ps" "pty" "smo"
## [61] "srt" "tck" "tcl" "usr" "xaxp"
## [66] "xaxs" "xaxt" "xpd" "yaxp" "yaxs"
## [71] "yaxt" "ylbias"
# Display the number of par() function list elements
length(plot_pars)
## [1] 72
# Set up a 2-by-2 plot array
par(mfrow=c(2, 2))
# Plot the Animals2 brain weight data as points
plot(Animals2$brain, type="p")
# Add the title
title("points")
# Plot the brain weights with lines
plot(Animals2$brain, type="l")
# Add the title
title("lines")
# Plot the brain weights as lines overlaid with points
plot(Animals2$brain, type="o")
# Add the title
title("overlaid")
# Plot the brain weights as steps
plot(Animals2$brain, type="s")
# Add the title
title("steps")
par(mfrow=c(1, 1))
# Compute max_hp
data(Cars93, package="MASS")
max_hp <- max(Cars93$Horsepower, mtcars$hp)
# Compute max_mpg
max_mpg <- max(Cars93$MPG.city, Cars93$MPG.highway, mtcars$mpg)
# Create plot with type = "n"
plot(mtcars$hp, mtcars$mpg,
type = "n", xlim = c(0, max_hp),
ylim = c(0, max_mpg), xlab = "Horsepower",
ylab = "Miles per gallon"
)
# Add open circles to plot
points(x=mtcars$hp, y=mtcars$mpg, pch = 1)
# Add solid squares to plot
points(x=Cars93$Horsepower, y=Cars93$MPG.city, pch = 15)
# Add open triangles to plot
points(x=Cars93$Horsepower, y=Cars93$MPG.highway, pch = 6)
# 200 evenly spaced evaluation points spanning [0, 10]
x <- seq(from = 0, to = 10, length.out = 200)
# First Gaussian density: mean 2, sd 0.2 (tall and narrow)
gauss1 <- dnorm(x, mean = 2, sd = 0.2)
# Second Gaussian density: mean 4, sd 0.5 (shorter and wider)
gauss2 <- dnorm(x, mean = 4, sd = 0.5)
# Draw the first density as a solid curve
plot(x, gauss1, type = "l", ylab = "Gaussian probability density")
# Overlay the second density as a heavy dashed curve
lines(x, gauss2, lty = 2, lwd = 3)
# Create an empty plot using type = "n"
plot(x=mtcars$hp, y=mtcars$mpg, type="n", xlab="Horsepower", ylab="Gas mileage")
# Add points with shapes determined by cylinder number
points(x=mtcars$hp, y=mtcars$mpg, pch = as.numeric(mtcars$cyl))
# Create a second empty plot
plot(x=mtcars$hp, y=mtcars$mpg, type="n", xlab="Horsepower", ylab="Gas mileage")
# Add points with shapes as cylinder characters
points(x=mtcars$hp, y=mtcars$mpg, pch = as.character(mtcars$cyl))
# Build a linear regression model for the whiteside data
data(whiteside, package="MASS")
linear_model <- lm(Gas ~ Temp, data=whiteside)
# Create a Gas vs. Temp scatterplot from the whiteside data
plot(y=whiteside$Gas, x=whiteside$Temp)
# Use abline() to add the linear regression line
abline(linear_model, lty=2)
# Create MPG.city vs. Horsepower plot with solid squares
data(Cars93, package="MASS")
plot(x=Cars93$Horsepower, y=Cars93$MPG.city, pch=15)
# Create index3, pointing to 3-cylinder cars
index3 <- which(Cars93$Cylinders == 3)
# Add text giving names of cars next to data points
text(x = Cars93$Horsepower[index3],
y = Cars93$MPG.city[index3],
labels = Cars93$Make[index3], adj = 0)
# Plot MPG.city vs. Horsepower as open circles
data(Cars93, package="MASS")
plot(x=Cars93$Horsepower, y=Cars93$MPG.city, pch=1)
# Create index3, pointing to 3-cylinder cars
index3 <- which(Cars93$Cylinders == 3)
# Highlight 3-cylinder cars as solid circles
points(x=Cars93$Horsepower[index3], y=Cars93$MPG.city[index3], pch=16)
# Add car names, offset from points, with larger bold text
text(x = Cars93$Horsepower[index3],
y = Cars93$MPG.city[index3],
labels = Cars93$Make[index3], adj = -0.2, cex=1.2, font=4)
# Plot Gas vs. Temp as solid triangles
data(whiteside, package="MASS")
plot(x=whiteside$Temp, y=whiteside$Gas, pch=17)
# Create indexB, pointing to "Before" data
indexB <- which(whiteside$Insul == "Before")
# Create indexA, pointing to "After" data
indexA <- which(whiteside$Insul == "After")
# Add "Before" text in blue, rotated 30 degrees, 80% size
text(x = whiteside$Temp[indexB], y = whiteside$Gas[indexB],
labels = "Before", col = "blue", srt = 30, cex = 0.8)
# Add "After" text in red, rotated -20 degrees, 80% size
text(x = whiteside$Temp[indexA], y = whiteside$Gas[indexA],
labels = "After", col = "red", srt = -20, cex = 0.8)
# Set up and label empty plot of Gas vs. Temp
data(whiteside, package="MASS")
plot(x=whiteside$Temp, y=whiteside$Gas,
type = "n", xlab = "Outside temperature",
ylab = "Heating gas consumption")
# Create indexB, pointing to "Before" data
indexB <- which(whiteside$Insul == "Before")
# Create indexA, pointing to "After" data
indexA <- which(whiteside$Insul == "After")
# Add "Before" data as solid triangles
points(x=whiteside$Temp[indexB], y=whiteside$Gas[indexB], pch=17)
# Add "After" data as open circles
points(x=whiteside$Temp[indexA], y=whiteside$Gas[indexA], pch=1)
# Add legend that identifies points as "Before" and "After"
legend("topright", pch = c(17, 1),
legend = c("Before", "After"))
# Create a boxplot of sugars by shelf value, without axes
data(UScereal, package="MASS")
boxplot(sugars ~ shelf, data=UScereal, axes=FALSE)
# Add a default y-axis to the left of the boxplot
axis(side = 2)
# Add an x-axis below the plot, labelled 1, 2, and 3
axis(side = 1, at = c(1, 2, 3))
# Add a second x-axis above the plot
axis(side = 3, at = c(1, 2, 3),
labels = c("floor", "middle", "top"))
# Create a scatterplot of MPG.city vs. Horsepower
data(Cars93, package="MASS")
plot(x=Cars93$Horsepower, y=Cars93$MPG.city)
# Call supsmu() to generate a smooth trend curve, with default bass
trend1 <- supsmu(x=Cars93$Horsepower, y=Cars93$MPG.city)
# Add this trend curve to the plot
lines(trend1)
# Call supsmu() for a second trend curve, with bass = 10
trend2 <- supsmu(x=Cars93$Horsepower, y=Cars93$MPG.city, bass=10)
# Add this trend curve as a heavy, dotted line
lines(trend2, lty=3, lwd=2)
Visual complexity – too much detail, or too many arrayed plots, defeat the goals of explanatory analysis:
Multiple plot arrays (such as through par(mfrow)) can help show much detail:
Plot arrays can also be setup (more work, but more flexibility) using the layout() function:
Example code includes:
# Compute the number of plots to be displayed
data(Cars93, package="MASS")
ncol(Cars93)^2
## [1] 729
# Plot the array of scatterplots (not run - it is a mess, though!)
# plot(Cars93)
# Construct the vector keep_vars
keep_vars <- c("calories", "protein", "fat",
"fibre", "carbo", "sugars")
# Use keep_vars to extract the desired subset of UScereal
data(UScereal, package="MASS")
df <- UScereal[, keep_vars]
# Set up a two-by-two plot array
par(mfrow=c(2, 2))
# Use matplot() to generate an array of two scatterplots
matplot(df$calories, df[, c("protein", "fat")], xlab="calories", ylab="")
# Add a title
title("Two scatterplots")
# Use matplot() to generate an array of three scatterplots
matplot(df$calories, df[, c("protein", "fat", "fibre")], xlab="calories", ylab="")
# Add a title
title("Three scatterplots")
# Use matplot() to generate an array of four scatterplots
matplot(df$calories, df[, c("protein", "fat", "fibre", "carbo")], xlab="calories", ylab="")
# Add a title
title("Four scatterplots")
# Use matplot() to generate an array of five scatterplots
matplot(df$calories, df[, c("protein", "fat", "fibre", "carbo", "sugars")], xlab="calories", ylab="")
# Add a title
title("Five scatterplots")
par(mfrow=c(1, 1))
# Create mfr_table of manufacturer frequencies
data(Cars93, package="MASS")
mfr_table <- table(Cars93$Manufacturer)
## NEED TO COMMENT THESE OUT - DEPENDENCY NOT AVAILABLE
# Create the default wordcloud from this table
# wordcloud(words = names(mfr_table),
# freq = as.numeric(mfr_table),
# scale = c(2, 0.25)
# )
# Change the minimum word frequency
# wordcloud(words = names(mfr_table),
# freq = as.numeric(mfr_table),
# scale = c(2, 0.25),
# min.freq = 1
# )
# Create model_table of model frequencies
model_table <- table(Cars93$Model)
# Create the wordcloud of all model names with smaller scaling
# wordcloud(words = names(model_table),
# freq = as.numeric(model_table),
# scale = c(0.75, 0.25),
# min.freq = 1
# )
# Set up a two-by-two plot array
par(mfrow=c(2, 2))
# Plot y1 vs. x1
plot(anscombe$x1, anscombe$y1)
# Plot y2 vs. x2
plot(anscombe$x2, anscombe$y2)
# Plot y3 vs. x3
plot(anscombe$x3, anscombe$y3)
# Plot y4 vs. x4
plot(anscombe$x4, anscombe$y4)
par(mfrow=c(1, 1))
# Shared axis limits so all four Anscombe panels use identical scales
x_cols <- paste0("x", 1:4)
y_cols <- paste0("y", 1:4)
xmin <- min(anscombe[, x_cols])
xmax <- max(anscombe[, x_cols])
ymin <- min(anscombe[, y_cols])
ymax <- max(anscombe[, y_cols])
# Set up a two-by-two plot array
par(mfrow=c(2, 2))
# Plot y1 vs. x1 with common x and y limits, labels & title
plot(anscombe$x1, anscombe$y1,
xlim = c(xmin, xmax),
ylim = c(ymin, ymax),
xlab = "x value", ylab = "y value",
main = "First dataset")
# Do the same for the y2 vs. x2 plot
plot(anscombe$x2, anscombe$y2,
xlim = c(xmin, xmax),
ylim = c(ymin, ymax),
xlab = "x value", ylab = "y value",
main = "Second dataset")
# Do the same for the y3 vs. x3 plot
plot(anscombe$x3, anscombe$y3,
xlim = c(xmin, xmax),
ylim = c(ymin, ymax),
xlab = "x value", ylab = "y value",
main = "Third dataset")
# Do the same for the y4 vs. x4 plot
plot(anscombe$x4, anscombe$y4,
xlim = c(xmin, xmax),
ylim = c(ymin, ymax),
xlab = "x value", ylab = "y value",
main = "Fourth dataset")
par(mfrow=c(1, 1))
# Set up a two-by-two plot array
par(mfrow=c(2, 2))
# Plot the raw duration data
data(geyser, package="MASS")
plot(geyser$duration, main="Raw data")
# Plot the normalized histogram of the duration data
MASS::truehist(geyser$duration, main="Histogram")
# Plot the density of the duration data
plot(density(geyser$duration), main="Density")
# Construct the normal QQ-plot of the duration data
car::qqPlot(geyser$duration, main="QQ-plot")
par(mfrow=c(1, 1))
# Define row1, row2, row3 for plots 1, 2, and 3
row1 <- c(0, 1)
row2 <- c(2, 0)
row3 <- c(0, 3)
# Use the matrix function to combine these rows into a matrix
layoutMatrix <- matrix(c(row1, row2, row3),
byrow = TRUE, nrow = 3)
# Call the layout() function to set up the plot array
layout(layoutMatrix)
# Show where the three plots will go
layout.show(n=3)
# Set up the plot array
layout(layoutMatrix)
# Construct vectors indexB and indexA
data(whiteside, package="MASS")
indexB <- which(whiteside$Insul == "Before")
indexA <- which(whiteside$Insul == "After")
# Create plot 1 and add title
plot(whiteside$Temp[indexB], whiteside$Gas[indexB],
ylim = c(0, 8)
)
title("Before data only")
# Create plot 2 and add title
plot(whiteside$Temp, whiteside$Gas,
ylim = c(0, 8)
)
title("Complete dataset")
# Create plot 3 and add title
plot(whiteside$Temp[indexA], whiteside$Gas[indexA],
ylim = c(0, 8)
)
title("After data only")
# Create row1, row2, and layoutVector
row1 <- c(1, 0, 0)
row2 <- c(0, 2, 2)
layoutVector <- c(row1, row2, row2)
# Convert layoutVector into layoutMatrix
layoutMatrix <- matrix(layoutVector, byrow = TRUE, nrow = 3)
# Set up the plot array
layout(layoutMatrix)
# Plot scatterplot
data(Boston, package="MASS")
plot(Boston$rad, Boston$zn)
# Plot sunflower plot
sunflowerplot(Boston$rad, Boston$zn)
par(mfrow=c(1, 1))
Chapter 5 - Advanced Customization:
Most R plotting functions are called for their side-effects rather than their return values:
Using color effectively helps convey information to the reader
Other graphics systems in R can expand significantly on the base R capabilities
Example code includes:
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.2.5
# Create a table of Cylinders frequencies
data(Cars93, package="MASS")
tbl <- table(Cars93$Cylinders)
# Generate a horizontal barplot of these frequencies
mids <- barplot(tbl, horiz = TRUE,
col = "transparent",
names.arg = ""
)
# Add names labels with text()
text(20, mids, names(tbl))
# Add count labels with text()
text(35, mids, as.numeric(tbl))
# Call symbols() to create the default bubbleplot
symbols(Cars93$Horsepower, Cars93$MPG.city,
circles = Cars93$Cylinders
)
# Repeat, with the inches argument specified
symbols(Cars93$Horsepower, Cars93$MPG.city,
circles = Cars93$Cylinders,
inches=0.2
)
# Call png() with the name of the file we want to create
png("bubbleplot.png")
# Re-create the plot from the last exercise
symbols(Cars93$Horsepower, Cars93$MPG.city,
circles = Cars93$Cylinders,
inches = 0.2)
# Save our file and return to our interactive session
dev.off()
## png
## 2
# Verify that we have created the file
list.files(pattern = "png")
## [1] "bubbleplot.png" "population.png"
# The twelve most distinguishable color names per Iliinsky and Steele
IScolors <- c("red", "green", "yellow", "blue",
              "black", "white", "pink", "cyan",
              "gray", "orange", "brown", "purple")
# Bar widths: the first six colors get double-width bars, the rest single
barWidths <- rep(c(2, 1), each = 6)
# Horizontal barplot; rev() so the vector order reads top-to-bottom,
# with each bar colored and labelled by its own color name
barplot(rev(barWidths), horiz = TRUE,
        col = rev(IScolors), axes = FALSE,
        names.arg = rev(IScolors), las = 1)
# Iliinsky and Steele color name vector
IScolors <- c("red", "green", "yellow", "blue",
"black", "white", "pink", "cyan",
"gray", "orange", "brown", "purple")
# Create the colored bubbleplot
symbols(Cars93$Horsepower, Cars93$MPG.city,
circles = Cars93$Cylinders, inches = 0.2,
bg = IScolors[as.factor(Cars93$Cylinders)])
# Create a table of Cylinders by Origin
tbl <- table(Cars93$Cylinders, Cars93$Origin)
# Create the default stacked barplot
barplot(tbl)
# Enhance this plot with color
barplot(tbl, col=IScolors[1:6])
# Use the data() function to load the dataCar data frame
data(dataCar, package="insuranceData")
# Generate the default tableplot() display
tabplot::tableplot(dataCar)
# Use xyplot() to construct the conditional scatterplot
data(UScereal, package="MASS")
lattice::xyplot(UScereal$calories ~ UScereal$sugars | UScereal$shelf)
# Create the basic plot (not displayed): basePlot
# (stores the data + aesthetic mapping; geoms are added per plot below)
basePlot <- ggplot(Cars93, aes(x = Horsepower, y = MPG.city))
# Display the basic scatterplot
basePlot +
geom_point()
# Color the points by Cylinders value
# NOTE(review): col here is set as an attribute (outside aes), indexed by
# the Cylinders values directly, so no legend is produced
basePlot +
geom_point(col = IScolors[Cars93$Cylinders])
# Make the point sizes also vary with Cylinders value
basePlot +
geom_point(col = IScolors[Cars93$Cylinders],
size = as.numeric(Cars93$Cylinders))
Data visualization is the combination of Statistics and Design:
The Anscombe plot examples show four different datasets explained by the identical linear model. This reinforces the importance of plotting the data prior to running analyses and drawing conclusions.
The “Grammar of Graphics” is a plotting framework based on the book by Leland Wilkinson, “The Grammar of Graphics” (1999). The gist is that graphics are made of distinct layers of grammatical elements. Meaningful plots are created through aesthetic mapping.
Essential Grammatical Elements include:
The ggplot2 package was one of the first developed and designed by Hadley Wickham. It implements the “Grammar of Graphics” in R, for example with:
The Anscombe data is good to have plotted for reference:
library(ggplot2)
data(anscombe)
# Stack the four Anscombe x/y pairs into one long data frame, tagging each
# pair with a `series` factor so ggplot2 can facet the four datasets that
# share an identical linear fit.
allX <- with(anscombe, c(x1, x2, x3, x4))
allY <- with(anscombe, c(y1, y2, y3, y4))
seriesId <- factor(rep(1:4, each = nrow(anscombe)))
ansFrame <- data.frame(x = allX, y = allY, series = seriesId)
# ggplot example for Anscombe data: points plus a full-range lm line per panel
ggplot(ansFrame, aes(x = x, y = y)) +
  geom_point() +
  geom_smooth(method = "lm", col = "red", se = FALSE, fullrange = TRUE) +
  facet_wrap(~ series, nrow = 2)
As well, the basic example code from above is useful to explore:
data(iris)
# Jittered sepal scatter, one panel per species, with a per-panel lm fit
ggplot(iris, aes(x=Sepal.Length, y=Sepal.Width)) +
geom_jitter(alpha = 0.6) +
facet_grid(. ~ Species) +
stat_smooth(method = "lm", se = FALSE, col="red")
Some additional basic ggplot syntax includes (cached, since plotting each point of the diamonds dataset is taxing for the graphics):
# Explore the mtcars data frame with str()
data(mtcars)
str(mtcars)
## 'data.frame': 32 obs. of 11 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp: num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear: num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
# Execute the following command
ggplot(mtcars, aes(x = cyl, y = mpg)) +
geom_point()
# Change the command below so that cyl is treated as factor
# (factor(cyl) makes the x axis discrete: 4, 6, 8 rather than continuous)
ggplot(mtcars, aes(x = factor(cyl), y = mpg)) +
geom_point()
# A scatter plot has been made for you
ggplot(mtcars, aes(x = wt, y = mpg)) +
geom_point()
# Replace ___ with the correct vector
ggplot(mtcars, aes(x = wt, y = mpg, col = disp)) +
geom_point()
# Replace ___ with the correct vector
ggplot(mtcars, aes(x = wt, y = mpg, size = disp)) +
geom_point()
# Explore the diamonds data frame with str()
data(diamonds)
str(diamonds)
## Classes 'tbl_df', 'tbl' and 'data.frame': 53940 obs. of 10 variables:
## $ carat : num 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
## $ cut : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
## $ color : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
## $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
## $ depth : num 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
## $ table : num 55 61 65 58 58 57 57 55 61 61 ...
## $ price : int 326 326 327 334 335 336 336 337 337 338 ...
## $ x : num 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
## $ y : num 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
## $ z : num 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
# Add geom_point() with +
ggplot(diamonds, aes(x = carat, y = price)) + geom_point()
# Add geom_point() and geom_smooth() with +
ggplot(diamonds, aes(x = carat, y = price)) + geom_point() + geom_smooth()
## `geom_smooth()` using method = 'gam'
# The plot you created in the previous exercise
ggplot(diamonds, aes(x = carat, y = price)) +
geom_point() +
geom_smooth()
## `geom_smooth()` using method = 'gam'
# Copy the above command but show only the smooth line
ggplot(diamonds, aes(x = carat, y = price)) +
geom_smooth()
## `geom_smooth()` using method = 'gam'
# Copy the above command and assign the correct value to col in aes()
ggplot(diamonds, aes(x = carat, y = price, col=clarity)) +
geom_smooth()
## `geom_smooth()` using method = 'gam'
# Keep the color settings from previous command. Plot only the points with argument alpha.
# (alpha mitigates overplotting for the 53,940 points)
ggplot(diamonds, aes(x = carat, y = price, col=clarity)) +
geom_point(alpha = 0.4)
# Create the object containing the data and aes layers: dia_plot
dia_plot <- ggplot(diamonds, aes(x = carat, y=price))
# Add a geom layer with + and geom_point()
dia_plot + geom_point()
# Add the same geom layer, but with aes() inside
# (geom-level aes() maps clarity to color for this layer only)
dia_plot + geom_point(aes(col = clarity))
set.seed(1)
# The dia_plot object has been created for you
dia_plot <- ggplot(diamonds, aes(x = carat, y = price))
# Expand dia_plot by adding geom_point() with alpha set to 0.2
dia_plot <- dia_plot + geom_point(alpha = 0.2)
# Plot dia_plot with additional geom_smooth() with se set to FALSE
dia_plot + geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'gam'
# Copy the command from above and add aes() with the correct mapping to geom_smooth()
dia_plot + geom_smooth(se = FALSE, aes(col = clarity))
## `geom_smooth()` using method = 'gam'
## `geom_smooth()` using method = 'gam'
Data Layer - How data structure influences plots (ggplot2 vs. base):
Can add some additional points (similar to points() in base, but with axes rescaling for you):
Example code includes:
data(mtcars)
str(mtcars)
## 'data.frame': 32 obs. of 11 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp: num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear: num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
# Plot the correct variables of mtcars
plot(mtcars$wt, mtcars$mpg, col=mtcars$cyl)
# Change cyl inside mtcars to a factor
mtcars$cyl <- as.factor(mtcars$cyl)
# Make the same plot as in the first instruction
plot(mtcars$wt, mtcars$mpg, col=mtcars$cyl)
# Use lm() to calculate a linear model and save it as carModel
carModel <- lm(mpg ~ wt, data = mtcars)
# Call abline() with carModel as first argument and set lty to 2
# (abline() draws the fit onto the already-open scatter plot)
abline(carModel, lty=2)
# Plot each subset efficiently with lapply
# You don't have to edit this code
# NOTE(review): this iterates over all 32 cyl values (see the 32 NULLs
# below), redrawing each per-cylinder line many times over;
# lapply(unique(mtcars$cyl), ...) would draw each line once.
lapply(mtcars$cyl, function(x) {
abline(lm(mpg ~ wt, mtcars, subset = (cyl == x)), col = x)
})
## [[1]]
## NULL
##
## [[2]]
## NULL
##
## [[3]]
## NULL
##
## [[4]]
## NULL
##
## [[5]]
## NULL
##
## [[6]]
## NULL
##
## [[7]]
## NULL
##
## [[8]]
## NULL
##
## [[9]]
## NULL
##
## [[10]]
## NULL
##
## [[11]]
## NULL
##
## [[12]]
## NULL
##
## [[13]]
## NULL
##
## [[14]]
## NULL
##
## [[15]]
## NULL
##
## [[16]]
## NULL
##
## [[17]]
## NULL
##
## [[18]]
## NULL
##
## [[19]]
## NULL
##
## [[20]]
## NULL
##
## [[21]]
## NULL
##
## [[22]]
## NULL
##
## [[23]]
## NULL
##
## [[24]]
## NULL
##
## [[25]]
## NULL
##
## [[26]]
## NULL
##
## [[27]]
## NULL
##
## [[28]]
## NULL
##
## [[29]]
## NULL
##
## [[30]]
## NULL
##
## [[31]]
## NULL
##
## [[32]]
## NULL
# This code will draw the legend of the plot
# You don't have to edit this code
# (col = 1:3 pairs with the cyl factor levels in plotting order)
legend(x = 5, y = 33, legend = levels(mtcars$cyl),
col = 1:3, pch = 1, bty = "n")
# Plot 1: add geom_point() to this command to create a scatter plot
ggplot(mtcars, aes(x = wt, y = mpg, col = cyl)) +
geom_point() # Fill in using instructions Plot 1
# Plot 2: include the lines of the linear models, per cyl
ggplot(mtcars, aes(x = wt, y = mpg, col = cyl)) +
geom_point() + # Copy from Plot 1
geom_smooth(method="lm", se=FALSE) # Fill in using instructions Plot 2
# Plot 3: include a lm for the entire dataset in its whole
ggplot(mtcars, aes(x = wt, y = mpg, col = cyl)) +
geom_point() + # Copy from Plot 2
geom_smooth(method="lm", se=FALSE) + # Copy from Plot 2
# aes(group = 1) overrides the per-cyl grouping so one overall lm is fit
geom_smooth(aes(group = 1), method="lm", se=FALSE, linetype = 2) # Fill in using instructions Plot 3
data(iris)
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
# Option 1
ggplot(iris, aes(x = Sepal.Length, y = Sepal.Width)) +
geom_point() +
geom_point(aes(x = Petal.Length, y = Petal.Width), col = "red")
# DS code to match up to lecturer data set formats
# NOTE(review): gather()/spread() are superseded by pivot_longer()/
# pivot_wider() in current tidyr; kept here to match the course output.
data(iris)
longIris <- tidyr::gather(iris, Type, Measure, -Species)
intIris <- tidyr::separate(longIris, Type, c("Part", "Metric"))
# rowNum is a hand-built id so spread() can pair each flower's Length and
# Width rows (presumably 1:150 for Sepal rows, 151:300 for Petal — verify)
intIris$rowNum <- c(1:150, 1:150, 151:300, 151:300)
iris.wide <- tidyr::spread(intIris, Metric, Measure)
iris.tidy <- dplyr::select(dplyr::mutate(intIris, Value=Measure, Measure=Metric), Species, Part, Measure, Value)
# Option 2
ggplot(iris.wide, aes(x = Length, y = Width, col = Part)) +
geom_point()
# Consider the structure of iris, iris.wide and iris.tidy (in that order)
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
str(iris.wide)
## 'data.frame': 300 obs. of 5 variables:
## $ Species: Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Part : chr "Petal" "Petal" "Petal" "Petal" ...
## $ rowNum : int 151 152 153 154 155 156 157 158 159 160 ...
## $ Length : num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
str(iris.tidy)
## 'data.frame': 600 obs. of 4 variables:
## $ Species: Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Part : chr "Sepal" "Sepal" "Sepal" "Sepal" ...
## $ Measure: chr "Length" "Length" "Length" "Length" ...
## $ Value : num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
# Think about which dataset you would use to get the plot shown right
# Fill in the ___ to produce the plot given to the right
ggplot(iris.tidy, aes(x = Species, y = Value, col = Part)) +
geom_jitter() +
facet_grid(. ~ Measure)
# Load the tidyr package
library(tidyr)
# Fill in the ___ to produce to the correct iris.tidy dataset
# ("\\." splits the gathered column names at the literal dot,
# e.g. "Sepal.Length" -> Part "Sepal", Measure "Length")
iris.tidy <- iris %>%
gather(key, Value, -Species) %>%
separate(key, c("Part", "Measure"), "\\.")
# Consider the head of iris, iris.wide and iris.tidy (in that order)
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
head(iris.wide)
## Species Part rowNum Length Width
## 1 setosa Petal 151 1.4 0.2
## 2 setosa Petal 152 1.4 0.2
## 3 setosa Petal 153 1.3 0.2
## 4 setosa Petal 154 1.5 0.2
## 5 setosa Petal 155 1.4 0.2
## 6 setosa Petal 156 1.7 0.4
head(iris.tidy)
## Species Part Measure Value
## 1 setosa Sepal Length 5.1
## 2 setosa Sepal Length 4.9
## 3 setosa Sepal Length 4.7
## 4 setosa Sepal Length 4.6
## 5 setosa Sepal Length 5.0
## 6 setosa Sepal Length 5.4
# Think about which dataset you would use to get the plot shown right
# Fill in the ___ to produce the plot given to the right
ggplot(iris.wide, aes(x = Length, y = Width, col = Part)) +
geom_jitter() +
facet_grid(. ~ Species)
# Add column with unique ids (don't need to change)
# (spread() needs a per-flower id so Length/Width rows can be paired back up)
iris$Flower <- 1:nrow(iris)
# Fill in the ___ to produce to the correct iris.wide dataset
iris.wide <- iris %>%
gather(key, value, -Species, -Flower) %>%
separate(key, c("Part", "Measure"), "\\.") %>%
spread(Measure, value)
Visible aesthetics are the cornerstone of the ggplot:
Modifying aesthetics:
Best practices for choosing among the aesthetics (though note that “there is a fair bit of creativity involved”):
Example code includes:
data(mtcars)
mtcars$cyl <- as.factor(mtcars$cyl)
mtcars$am <- as.factor(mtcars$am)
# Map cyl to y
ggplot(mtcars, aes(x=mpg, y=cyl)) + geom_point()
# Map cyl to x
ggplot(mtcars, aes(y=mpg, x=cyl)) + geom_point()
# Map cyl to col
ggplot(mtcars, aes(y=mpg, x=wt, col=cyl)) + geom_point()
# Change shape and size of the points in the above plot
# (shape/size outside aes() are fixed attributes, not data mappings)
ggplot(mtcars, aes(y=mpg, x=wt, col=cyl)) + geom_point(shape=1, size=4)
# Map cyl to fill
ggplot(mtcars, aes(x = wt, y = mpg, fill = cyl)) +
geom_point()
# Change shape, size and alpha of the points in the above plot
ggplot(mtcars, aes(x = wt, y = mpg, fill = cyl)) +
geom_point(shape=16, size=6, alpha=0.6)
# Map cyl to size
ggplot(mtcars, aes(y=mpg, x=wt, size=cyl)) + geom_point()
## Warning: Using size for a discrete variable is not advised.
# Map cyl to alpha
ggplot(mtcars, aes(y=mpg, x=wt, alpha=cyl)) + geom_point()
# Map cyl to shape
ggplot(mtcars, aes(y=mpg, x=wt, shape=cyl)) + geom_point()
# Map cyl to labels
ggplot(mtcars, aes(y=mpg, x=wt, label=cyl)) + geom_text()
# Define a hexadecimal color
my_color <- "#123456"
# Set the color aesthetic
ggplot(mtcars, aes(x=wt, y=mpg, col=cyl)) + geom_point()
# Set the color aesthetic and attribute
# (the attribute set in the geom overrides the aes() mapping)
ggplot(mtcars, aes(x=wt, y=mpg, col=cyl)) + geom_point(col = my_color)
# Set the fill aesthetic and color, size and shape attributes
ggplot(mtcars, aes(x=wt, y=mpg, fill=cyl)) + geom_point(size=10, shape=23, col=my_color)
# Expand to draw points with alpha 0.5
ggplot(mtcars, aes(x = wt, y = mpg, fill = cyl)) + geom_point(alpha=0.5)
# Expand to draw points with shape 24 and color yellow
ggplot(mtcars, aes(x = wt, y = mpg, fill = cyl)) + geom_point(shape=24, col="yellow")
# Expand to draw text with label x, color red and size 10
ggplot(mtcars, aes(x = wt, y = mpg, fill = cyl)) + geom_text(label="x", col="red", size=10)
# Map mpg onto x, qsec onto y and factor(cyl) onto col
ggplot(mtcars, aes(x=mpg, y=qsec, col=factor(cyl))) + geom_point()
# Add mapping: factor(am) onto shape
ggplot(mtcars, aes(x=mpg, y=qsec, col=factor(cyl), shape=factor(am))) + geom_point()
# Add mapping: (hp/wt) onto size
ggplot(mtcars, aes(x=mpg, y=qsec, col=factor(cyl), shape=factor(am), size=(hp/wt))) + geom_point()
# Basic scatter plot: wt on x-axis and mpg on y-axis; map cyl to col
ggplot(mtcars, aes(x=wt, y=mpg, col=cyl)) + geom_point(size=4)
# Hollow circles - an improvement
ggplot(mtcars, aes(x=wt, y=mpg, col=cyl)) + geom_point(size=4, shape=1)
# Add transparency - very nice
ggplot(mtcars, aes(x=wt, y=mpg, col=cyl)) + geom_point(size=4, alpha=0.6)
Next, bar plots are examined using the same data:
cyl.am <- ggplot(mtcars, aes(x = factor(cyl), fill = factor(am)))
# Add geom (position = "stack" by default)
cyl.am + geom_bar()
# Fill - show proportion
# (position = "fill" normalizes each bar to 1, showing proportions)
cyl.am +
geom_bar(position = "fill")
# Dodging - principles of similarity and proximity
cyl.am +
geom_bar(position = "dodge")
# Clean up the axes with scale_ functions
val = c("#E41A1C", "#377EB8")
lab = c("Manual", "Automatic")
cyl.am +
geom_bar(position = "dodge") +
scale_x_discrete("Cylinders") +
scale_y_continuous("Number") +
scale_fill_manual("Transmission",
values = val,
labels = lab)
# Add a new column called group
# (a constant y lets geom_jitter() fan a 1-D variable out vertically)
mtcars$group <- 0
# Create jittered plot of mtcars: mpg onto x, group onto y
ggplot(mtcars, aes(x = mpg, y=group)) + geom_jitter()
# Change the y aesthetic limits
ggplot(mtcars, aes(x = mpg, y=group)) + geom_jitter() + scale_y_continuous(limits = c(-2, 2))
Further, the diamonds data set is explored to show techniques for minimizing over-plotting problems. Per previous, it is cached due to the lengthy plot times driven by the many data points:
data(diamonds)
str(diamonds)
## Classes 'tbl_df', 'tbl' and 'data.frame': 53940 obs. of 10 variables:
## $ carat : num 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
## $ cut : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
## $ color : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
## $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
## $ depth : num 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
## $ table : num 55 61 65 58 58 57 57 55 61 61 ...
## $ price : int 326 326 327 334 335 336 336 337 337 338 ...
## $ x : num 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
## $ y : num 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
## $ z : num 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
# Scatter plot: carat (x), price (y), clarity (col)
ggplot(diamonds, aes(x=carat, y=price, col=clarity)) + geom_point()
# Adjust for overplotting
# (transparency reveals point density where the 53,940 points pile up)
ggplot(diamonds, aes(x=carat, y=price, col=clarity)) + geom_point(alpha = 0.5)
# Scatter plot: clarity (x), carat (y), price (col)
ggplot(diamonds, aes(y=carat, x=clarity, col=price)) + geom_point(alpha = 0.5)
# Dot plot with jittering
ggplot(diamonds, aes(y=carat, x=clarity, col=price)) + geom_point(alpha = 0.5, position="jitter")
The geometries layer includes the most common plot types:
Scatter plots examples - geom_point(), geom_jitter(), geom_abline():
Bar plots examples - histogram, bar, errorbar:
Line plots examples - line:
Example code from mtcars includes:
# mtcars point plots
# Plot the cyl on the x-axis and wt on the y-axis
ggplot(mtcars, aes(x=cyl, y=wt)) + geom_point()
# Use geom_jitter() instead of geom_point()
ggplot(mtcars, aes(x=cyl, y=wt)) + geom_jitter()
# Define the position object using position_jitter(): posn.j
# (a named position object keeps the jitter amount reusable/consistent)
posn.j <- position_jitter(width = 0.1)
# Use posn.j in geom_point()
ggplot(mtcars, aes(x=cyl, y=wt)) + geom_point(position = posn.j)
# mtcars bar plots
# Make a univariate histogram
ggplot(mtcars, aes(x=mpg)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Change the bin width to 1
ggplot(mtcars, aes(x=mpg)) + geom_histogram(binwidth = 1)
# Change the y aesthetic to density
# NOTE(review): ..density.. is the legacy spelling; newer ggplot2 uses
# after_stat(density). Kept to match the course material.
ggplot(mtcars, aes(x=mpg)) + geom_histogram(binwidth = 1, aes(y=..density..))
# Custom color code
myBlue <- "#377EB8"
# Change the fill color to myBlue
ggplot(mtcars, aes(x=mpg)) + geom_histogram(binwidth = 1, aes(y=..density..), fill=myBlue)
# Draw a bar plot of cyl, filled according to am
ggplot(mtcars, aes(x=cyl, fill=am)) + geom_bar()
# Change the position argument to stack
ggplot(mtcars, aes(x=cyl, fill=am)) + geom_bar(position="stack")
# Change the position argument to fill
ggplot(mtcars, aes(x=cyl, fill=am)) + geom_bar(position="fill")
# Change the position argument to dodge
ggplot(mtcars, aes(x=cyl, fill=am)) + geom_bar(position="dodge")
# Draw a bar plot of cyl, filled according to am
ggplot(mtcars, aes(x=cyl, fill=am)) + geom_bar()
# Change the position argument to "dodge"
ggplot(mtcars, aes(x=cyl, fill=am)) + geom_bar(position = "dodge")
# Define posn_d with position_dodge()
posn_d <- position_dodge(width=0.2)
# Change the position argument to posn_d
# (a narrow dodge width makes the bars partially overlap)
ggplot(mtcars, aes(x=cyl, fill=am)) + geom_bar(position = posn_d)
# Use posn_d as position and adjust alpha to 0.6
ggplot(mtcars, aes(x=cyl, fill=am)) + geom_bar(position = posn_d, alpha=0.6)
# A basic histogram, add coloring defined by cyl
ggplot(mtcars, aes(x=mpg, fill=cyl)) +
geom_histogram(binwidth = 1)
# Change position to identity
ggplot(mtcars, aes(x=mpg, fill=cyl)) +
geom_histogram(binwidth = 1, position="identity")
# Change geom to freqpoly (position is identity by default)
ggplot(mtcars, aes(x=mpg, col=cyl)) +
geom_freqpoly(binwidth = 1, position="identity")
# Example of how to use a brewed color palette
ggplot(mtcars, aes(x = cyl, fill = am)) +
geom_bar() +
scale_fill_brewer(palette = "Set1")
# Basic histogram plot command
ggplot(mtcars, aes(mpg)) +
geom_histogram(binwidth = 1)
# Expand the histogram to fill using am
ggplot(mtcars, aes(x=mpg, fill=am)) +
geom_histogram(binwidth = 1)
# Change the position argument to "dodge"
ggplot(mtcars, aes(x=mpg, fill=am)) +
geom_histogram(binwidth = 1, position="dodge")
# Change the position argument to "fill"
ggplot(mtcars, aes(x=mpg, fill=am)) +
geom_histogram(binwidth = 1, position="fill")
## Warning: Removed 16 rows containing missing values (geom_bar).
# Change the position argument to "identity" and set alpha to 0.4
ggplot(mtcars, aes(x=mpg, fill=am)) +
geom_histogram(binwidth = 1, position="identity", alpha = 0.4)
# Change fill to cyl
ggplot(mtcars, aes(x=mpg, fill=cyl)) +
geom_histogram(binwidth = 1, position="identity", alpha = 0.4)
Next, a few examples are run from dataset car::Vocab (cached due to plotting size/time):
Vocab <- car::Vocab
str(Vocab)
## 'data.frame': 21638 obs. of 4 variables:
## $ year : int 2004 2004 2004 2004 2004 2004 2004 2004 2004 2004 ...
## $ sex : Factor w/ 2 levels "Female","Male": 1 1 2 1 2 2 1 2 2 1 ...
## $ education : int 9 14 14 17 14 14 12 10 11 9 ...
## $ vocabulary: int 3 6 9 8 1 7 6 6 5 1 ...
# Basic scatter plot of vocabulary (y) against education (x). Use geom_point()
ggplot(Vocab, aes(x=education, y=vocabulary)) + geom_point()
# Use geom_jitter() instead of geom_point()
ggplot(Vocab, aes(x=education, y=vocabulary)) + geom_jitter()
# Using the above plotting command, set alpha to a very low 0.2
ggplot(Vocab, aes(x=education, y=vocabulary)) + geom_jitter(alpha = 0.2)
# Using the above plotting command, set the shape to 1
ggplot(Vocab, aes(x=education, y=vocabulary)) + geom_jitter(alpha = 0.2, shape=1)
# Plot education on x and vocabulary on fill
# Use the default brewed color palette
ggplot(Vocab, aes(x = education, fill = vocabulary)) +
geom_bar(position="fill") +
scale_fill_brewer()
# Definition of a set of blue colors
# NOTE(review): brewer.pal() is called unqualified; assumes RColorBrewer
# is attached earlier in the document — verify.
blues <- brewer.pal(9, "Blues")
# Make a color range using colorRampPalette() and the set of blues
# (interpolates the 9 brewer blues to any number of colors, 11 below)
blue_range <- colorRampPalette(blues)
# Use blue_range to adjust the color of the bars, use scale_fill_manual()
ggplot(Vocab, aes(x = education, fill = factor(vocabulary))) +
geom_bar(position = "fill") +
scale_fill_manual(values = blue_range(11))
Lastly, a few additional plots are displayed:
# Print out head of economics
data(economics)
head(economics)
## # A tibble: 6 × 6
## date pce pop psavert uempmed unemploy
## <date> <dbl> <int> <dbl> <dbl> <int>
## 1 1967-07-01 507.4 198712 12.5 4.5 2944
## 2 1967-08-01 510.5 198911 12.5 4.7 2945
## 3 1967-09-01 516.3 199113 11.7 4.6 2958
## 4 1967-10-01 512.9 199311 12.5 4.9 3143
## 5 1967-11-01 518.1 199498 12.5 4.7 3066
## 6 1967-12-01 525.8 199657 12.1 4.8 3018
# Plot unemploy as a function of date using a line plot
ggplot(economics, aes(x = date, y = unemploy)) + geom_line()
# Adjust plot to represent the fraction of total population that is unemployed
ggplot(economics, aes(x = date, y = unemploy/pop)) + geom_line()
# Recession windows as Dates built from day offsets relative to 1970-01-01
recess <- data.frame(begin=as.Date(c(-31, 1400, 3652, 4199, 7486, 11382), origin="1970-01-01"), end=as.Date(c(304, 1885, 3834, 4687, 7729, 11627), origin="1970-01-01"))
# inherit.aes=FALSE stops the rect layer from inheriting the x/y mapping,
# since recess has its own begin/end columns
ggplot(economics, aes(x = date, y = unemploy/pop)) +
geom_line() +
geom_rect(data=recess, inherit.aes=FALSE, aes(xmin=begin, xmax=end, ymin=-Inf, ymax=+Inf), fill="red", alpha=0.2)
# Cannot find dataset . . .
# Check the structure as a starting point
# str(fish.species)
# Use gather to go from fish.species to fish.tidy
# fish.tidy <- gather(fish.species, Species, Capture, -Year)
# Recreate the plot shown on the right
# ggplot(fish.tidy, aes(x = Year, y = Capture, col=Species)) + geom_line()
The qplot functionality is for making quick and dirty plots:
Basically, the qplot() is nice for just a quick and dirty analysis, though it will have much less flexibility on a go-forward basis.
Example code for qplot includes:
# The old way (shown)
plot(mpg ~ wt, data = mtcars)
# Using ggplot:
ggplot(mtcars, aes(x=wt, y=mpg)) + geom_point()
# Using qplot:
# (qplot() guesses the geom from the supplied aesthetics)
qplot(wt, mpg, data=mtcars)
# Categorical:
# cyl
qplot(wt, mpg, data=mtcars, size=cyl)
## Warning: Using size for a discrete variable is not advised.
# gear
qplot(wt, mpg, data=mtcars, size=gear)
# Continuous
# hp
qplot(wt, mpg, data=mtcars, col=hp)
# qsec
qplot(wt, mpg, data=mtcars, col=qsec)
# qplot() with x only
qplot(factor(cyl), data=mtcars)
# qplot() with x and y
qplot(factor(cyl), factor(vs), data=mtcars)
# qplot() with geom set to jitter manually
qplot(factor(cyl), factor(vs), data=mtcars, geom="jitter")
# Make a dot plot with ggplot
ggplot(mtcars, aes(x=cyl, y=wt, fill = factor(am))) +
geom_dotplot(stackdir="center", binaxis="y")
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.
# qplot with geom "dotplot", binaxis = "y" and stackdir = "center"
qplot(cyl, wt, fill=factor(am), data=mtcars, geom="dotplot", binaxis="y", stackdir="center")
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.
Course #1 wrap-up comments:
A few wrap-up coding exercises for ggplot #1 include:
# Check out the head of ChickWeight
data(ChickWeight)
head(ChickWeight)
## Grouped Data: weight ~ Time | Chick
## weight Time Chick Diet
## 1 42 0 1 1
## 2 51 2 1 1
## 3 59 4 1 1
## 4 64 6 1 1
## 5 76 8 1 1
## 6 93 10 1 1
# Use ggplot() for the second instruction
# (group=Chick draws one growth line per chick)
ggplot(ChickWeight, aes(x=Time, y=weight)) + geom_line(aes(group=Chick))
# Use ggplot() for the third instruction
ggplot(ChickWeight, aes(x=Time, y=weight, col=Diet)) + geom_line(aes(group=Chick))
# Use ggplot() for the last instruction
# (faint per-chick lines plus one thick smooth per Diet)
ggplot(ChickWeight, aes(x=Time, y=weight, col=Diet)) + geom_line(aes(group=Chick), alpha=0.3) + geom_smooth(lwd=2, se=FALSE)
## `geom_smooth()` using method = 'loess'
## `geom_smooth()` using method = 'loess'
# Check out the structure of titanic
library(titanic)
## Warning: package 'titanic' was built under R version 3.2.5
# NOTE(review): the detach/reload of dplyr below looks like a knitr artifact
# used to surface the masking messages in the rendered output — verify.
library(dplyr)
detach(package:dplyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:xts':
##
## first, last
## The following objects are masked from 'package:data.table':
##
## between, last
## The following object is masked from 'package:purrr':
##
## order_by
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
titanicFull <- titanic::titanic_train
str(titanicFull)
## 'data.frame': 891 obs. of 12 variables:
## $ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
## $ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : chr "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
## $ Sex : chr "male" "female" "female" "female" ...
## $ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : chr "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : chr "" "C85" "" "C123" ...
## $ Embarked : chr "S" "C" "S" "S" ...
# Keep four columns and drop rows with any NA (Age has NAs per str above),
# leaving 714 of 891 rows
titanic <- titanicFull %>%
select(Pclass, Sex, Survived, Age) %>%
filter(complete.cases(.))
str(titanic)
## 'data.frame': 714 obs. of 4 variables:
## $ Pclass : int 3 1 3 1 3 1 3 3 2 3 ...
## $ Sex : chr "male" "female" "female" "female" ...
## $ Survived: int 0 1 1 1 0 0 0 1 1 1 ...
## $ Age : num 22 38 26 35 35 54 2 27 14 4 ...
# Use ggplot() for the first instruction
ggplot(titanic, aes(x=factor(Pclass), fill=factor(Sex))) +
geom_bar(position = "dodge")
# Use ggplot() for the second instruction
ggplot(titanic, aes(x=factor(Pclass), fill=factor(Sex))) +
geom_bar(position = "dodge") +
facet_grid(. ~ Survived)
# Position jitter (use below)
posn.j <- position_jitter(0.5, 0)
# Use ggplot() for the last instruction
ggplot(titanic, aes(x=factor(Pclass), y=Age, col=factor(Sex))) +
geom_jitter(size=3, alpha=0.5, position=posn.j) +
facet_grid(. ~ Survived)
The second course expands on the remaining layers of ggplot2: Statistics, Coordinates, Facets, and Themes.
The statistics layer for ggplot2 has two basic components:
The statistics can also be called independently (outside the geom):
Example code from mtcars includes:
# Explore the mtcars data frame with str()
data(mtcars)
str(mtcars)
## 'data.frame': 32 obs. of 11 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp: num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear: num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
# A scatter plot with LOESS smooth:
ggplot(mtcars, aes(x = wt, y = mpg)) +
geom_point() +
geom_smooth()
## `geom_smooth()` using method = 'loess'
# A scatter plot with an ordinary Least Squares linear model:
ggplot(mtcars, aes(x = wt, y = mpg)) +
geom_point() +
geom_smooth(method = "lm")
# The previous plot, without CI ribbon:
ggplot(mtcars, aes(x = wt, y = mpg)) +
geom_point() +
geom_smooth(method = "lm", se=FALSE)
# The previous plot, without points:
ggplot(mtcars, aes(x = wt, y = mpg)) +
geom_smooth(method = "lm", se=FALSE)
# Define cyl as a factor variable
ggplot(mtcars, aes(x = wt, y = mpg, col = factor(cyl))) +
geom_point() +
stat_smooth(method = "lm", se = FALSE)
# Complete the following ggplot command as instructed
# (aes(group=1) adds one overall fit on top of the per-cyl fits)
ggplot(mtcars, aes(x = wt, y = mpg, col = factor(cyl))) +
geom_point() +
stat_smooth(method = "lm", se = FALSE) +
stat_smooth(method = "lm", se = FALSE, aes(group=1))
# Plot 1: change the LOESS span
ggplot(mtcars, aes(x = wt, y = mpg)) +
geom_point() +
# Add span below
geom_smooth(se = FALSE, span=0.7)
## `geom_smooth()` using method = 'loess'
# Plot 2: Set the overall model to LOESS and use a span of 0.7
ggplot(mtcars, aes(x = wt, y = mpg, col = factor(cyl))) +
geom_point() +
stat_smooth(method = "lm", se = FALSE) +
# Change method and add span below
stat_smooth(method = "loess", aes(group = 1),
se = FALSE, col = "black", span=0.7)
# Plot 3: Set col to "All", inside the aes layer of stat_smooth()
# (mapping col to the constant "All" adds an "All" entry to the legend)
ggplot(mtcars, aes(x = wt, y = mpg, col = factor(cyl))) +
geom_point() +
stat_smooth(method = "lm", se = FALSE) +
stat_smooth(method = "loess",
# Add col inside aes()
aes(group = 1, col="All"),
# Remove the col argument below
se = FALSE, span = 0.7)
# Plot 4: Add scale_color_manual to change the colors
# NOTE(review): brewer.pal() is unqualified; assumes RColorBrewer is
# attached earlier in the document — verify.
myColors <- c(brewer.pal(3, "Dark2"), "black")
ggplot(mtcars, aes(x = wt, y = mpg, col = factor(cyl))) +
geom_point() +
# (span has no effect for method = "lm"; harmless here)
stat_smooth(method = "lm", se = FALSE, span = 0.75) +
stat_smooth(method = "loess",
aes(group = 1, col="All"),
se = F, span = 0.7) +
# Add correct arguments to scale_color_manual
scale_color_manual("Cylinders", values=myColors)
# Display structure of mtcars
str(mtcars)
## 'data.frame': 32 obs. of 11 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp: num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear: num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
# Convert cyl and am to factors:
mtcars$cyl <- as.factor(mtcars$cyl)
mtcars$am <- as.factor(mtcars$am)
# Define positions:
posn.d <- position_dodge(width = 0.1)
posn.jd <- position_jitterdodge(jitter.width = 0.1, dodge.width = 0.2)
posn.j <- position_jitter(width = 0.2)
# base layers:
wt.cyl.am <- ggplot(mtcars, aes(x=cyl, y=wt, col=am, fill=am, group=am))
# Plot 1: Jittered, dodged scatter plot with transparent points
wt.cyl.am +
geom_point(position = posn.jd, alpha = 0.6)
# Plot 2: Mean and SD - the easy way
# (mean_sdl / mean_cl_normal are summary helpers; presumably from Hmisc
# via ggplot2's wrappers — verify the package is available)
wt.cyl.am +
geom_point(position = posn.jd, alpha = 0.6) +
stat_summary(fun.data=mean_sdl, fun.args=list(mult=1), position=posn.d)
# Plot 3: Mean and 95% CI - the easy way
wt.cyl.am +
geom_point(position = posn.jd, alpha = 0.6) +
stat_summary(fun.data=mean_cl_normal, position=posn.d)
# Plot 4: Mean and SD - with T-tipped error bars - fill in ___
# NOTE(review): fun.y was deprecated in favor of `fun` in ggplot2 >= 3.3;
# kept to match the course-era output.
wt.cyl.am +
stat_summary(geom = "point", fun.y = mean,
position = posn.d) +
stat_summary(geom = "errorbar", fun.data = mean_sdl,
position = posn.d, fun.args = list(mult = 1), width = 0.1)
xx <- 1:100
# Function to save range for use in ggplot
# Summary function for ggplot2's stat_summary(): the full data range of x
# in the ymin/ymax form ggplot2 expects.
#
# x     : numeric vector.
# na.rm : drop NA values before computing the range (default FALSE,
#         matching the original behavior).
# Returns a one-row data.frame with ymin = min(x) and ymax = max(x).
gg_range <- function(x, na.rm = FALSE) {
  rng <- range(x, na.rm = na.rm)  # single pass instead of min() + max()
  data.frame(ymin = rng[1], # Min
             ymax = rng[2]  # Max
  )
}
gg_range(xx)
## ymin ymax
## 1 1 100
# Required output:
# ymin ymax
# 1 1 100
# Custom summary function:
# Summary function for ggplot2's stat_summary(): median with interquartile
# range bounds, in the y/ymin/ymax form ggplot2 expects.
#
# x     : numeric vector.
# na.rm : drop NA values before computing statistics (default FALSE,
#         matching the original behavior).
# Returns a one-row data.frame with y = median, ymin = 1st quartile,
# ymax = 3rd quartile. The row name "25%" is inherited from quantile()'s
# named result, as in the required output above.
med_IQR <- function(x, na.rm = FALSE) {
  data.frame(y    = median(x, na.rm = na.rm),         # Median
             ymin = quantile(x, 0.25, na.rm = na.rm), # 1st quartile
             ymax = quantile(x, 0.75, na.rm = na.rm)  # 3rd quartile
  )
}
med_IQR(xx)
## y ymin ymax
## 25% 50.5 25.75 75.25
# Required output:
# y ymin ymax
# 25% 50.5 25.75 75.25
wt.cyl.am <- ggplot(mtcars, aes(x = cyl,y = wt, col = am, fill = am, group = am))
# Add three stat_summary calls to wt.cyl.am
# (opaque IQR band, translucent full-range band, and an "X" at the median;
# med_IQR and gg_range are the custom summary functions defined above)
wt.cyl.am +
stat_summary(geom = "linerange", fun.data = med_IQR,
position = posn.d, size = 3) +
stat_summary(geom = "linerange", fun.data = gg_range,
position = posn.d, size = 3,
alpha = 0.4) +
stat_summary(geom = "point", fun.y = median,
position = posn.d, size = 3,
col = "black", shape = "X")
Further examples (cached) from car::Vocab include:
Vocab <- car::Vocab
# Plot 1: Jittered scatter plot, add a linear model (lm) smooth:
ggplot(Vocab, aes(x = education, y = vocabulary)) +
geom_jitter(alpha = 0.2) +
stat_smooth(method="lm", se=FALSE)
# Plot 2: Only lm, colored by year
ggplot(Vocab, aes(x = education, y = vocabulary, col=factor(year))) +
stat_smooth(method="lm", se=FALSE)
# Plot 3: Set a color brewer palette
# (more year levels than the Blues palette supports, hence the warning below)
ggplot(Vocab, aes(x = education, y = vocabulary, col=factor(year))) +
stat_smooth(method="lm", se=FALSE) +
scale_color_brewer()
## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Blues is 9
## Returning the palette you asked for with that many colors
# Plot 4: Add the group, specify alpha and size
# (col mapped to continuous year for the gradient; group kept discrete so
# each year still gets its own fitted line)
ggplot(Vocab, aes(x = education, y = vocabulary, col = year, group=factor(year))) +
stat_smooth(method = "lm", se = FALSE, alpha=0.6, size=2) +
scale_color_gradientn(colors = brewer.pal(9,"YlOrRd"))
# Use stat_quantile instead of stat_smooth:
# (quantile regression; defaults to quantiles 0.25, 0.5, 0.75)
ggplot(Vocab, aes(x = education, y = vocabulary, col = year, group = factor(year))) +
stat_quantile(alpha = 0.6, size = 2) +
scale_color_gradientn(colors = brewer.pal(9,"YlOrRd"))
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x
## Warning in rq.fit.br(wx, wy, tau = tau, ...): Solution may be nonunique
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x
## Warning in rq.fit.br(wx, wy, tau = tau, ...): Solution may be nonunique
## Warning in rq.fit.br(wx, wy, tau = tau, ...): Solution may be nonunique
# Set quantile to 0.5:
ggplot(Vocab, aes(x = education, y = vocabulary, col = year, group = factor(year))) +
stat_quantile(alpha = 0.6, size = 2, quantiles=c(0.5)) +
scale_color_gradientn(colors = brewer.pal(9,"YlOrRd"))
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x
## Warning in rq.fit.br(wx, wy, tau = tau, ...): Solution may be nonunique
# Plot with linear and loess model
# NOTE: aes(col = "red") / aes(col = "blue") do NOT set literal colors; they
# map constant labels onto the color scale, which scale_color_discrete then
# renames to "LOESS" and "lm" for the legend.
p <- ggplot(Vocab, aes(x = education, y = vocabulary)) +
stat_smooth(method = "loess", aes(col = "red"), se = F) +
stat_smooth(method = "lm", aes(col = "blue"), se = F) +
scale_color_discrete("Model", labels = c("red" = "LOESS", "blue" = "lm"))
# Add stat_sum (point size encodes the count of overlapping observations)
p + stat_sum()
# Add stat_sum and set size range
p + stat_sum() + scale_size(range = c(1, 10))
The coordinates layers control the plot dimensions:
The facets are based on the concept of small multiples as per the Tufte book “The Visual Display of Quantitative Information” (1983):
Example code includes:
data(mtcars);
mtcars$cyl <- as.factor(mtcars$cyl);
mtcars$am <- as.factor(mtcars$am)
# Basic ggplot() command, coded for you
p <- ggplot(mtcars, aes(x = wt, y = hp, col = am)) + geom_point() + geom_smooth()
# Add scale_x_continuous
# CAUTION: setting scale limits DROPS out-of-range rows before stats are
# computed, which breaks the loess fit (see warnings below); use
# coord_cartesian() to zoom without discarding data.
p + scale_x_continuous(limits = c(3,6), expand=c(0,0))
## `geom_smooth()` using method = 'loess'
## Warning: Removed 12 rows containing non-finite values (stat_smooth).
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : span too small. fewer data values than degrees of freedom.
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at 3.168
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 4e-006
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 3.168
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 0.002
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at 3.572
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 4e-006
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 4e-006
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger
## Warning: Computation failed in `stat_smooth()`:
## NA/NaN/Inf in foreign function call (arg 5)
## Warning: Removed 12 rows containing missing values (geom_point).
# The proper way to zoom in: coord_cartesian() only clips the viewport,
# so all data still feed the smooth (no dropped rows, no loess warnings)
p + coord_cartesian(xlim=c(3, 6))
## `geom_smooth()` using method = 'loess'
data(iris)
# Complete basic scatter plot function
base.plot <- ggplot(iris, aes(x=Sepal.Length, y=Sepal.Width, col=Species)) +
geom_jitter() +
geom_smooth(method = "lm", se = FALSE)
# Plot base.plot: default aspect ratio
base.plot
# Fix aspect ratio (1:1) of base.plot
base.plot + coord_equal()
# Create stacked bar plot: thin.bar
# (x = 1 collapses everything into a single stacked bar)
thin.bar <- ggplot(mtcars, aes(x=1, fill=cyl)) +
geom_bar()
# Convert thin.bar to pie chart
# (a pie chart is just a stacked bar wrapped onto polar coordinates;
# the default bar width leaves a hole in the middle)
thin.bar + coord_polar(theta = "y")
# Create stacked bar plot: wide.bar
wide.bar <- ggplot(mtcars, aes(x=1, fill=cyl)) +
geom_bar(width=1)
# Convert wide.bar to pie chart (width = 1 fills the circle completely)
wide.bar + coord_polar(theta="y")
# Basic scatter plot:
p <- ggplot(mtcars, aes(x = wt, y = mpg)) +
geom_point()
# Separate rows according to transmission type, am
p + facet_grid(am ~ .)
# Separate columns according to cylinders, cyl
p + facet_grid(. ~ cyl)
# Separate by both columns and rows
p + facet_grid(am ~ cyl)
# Code to create the cyl_am col and myCol vector
# (cyl_am combines both factors, e.g. "4_0"; rbind interleaves blue and red
# shades so adjacent cyl_am levels alternate hue families)
mtcars$cyl_am <- paste(mtcars$cyl, mtcars$am, sep = "_")
myCol <- rbind(brewer.pal(9, "Blues")[c(3,6,8)],
brewer.pal(9, "Reds")[c(3,6,8)])
# Basic scatter plot, add color scale:
ggplot(mtcars, aes(x = wt, y = mpg, col=cyl_am)) +
geom_point() + scale_color_manual(values = myCol)
# Facet according on rows and columns.
ggplot(mtcars, aes(x = wt, y = mpg, col=cyl_am)) +
geom_point() + scale_color_manual(values = myCol) +
facet_grid(gear ~ vs)
# Add more variables (disp encoded as point size)
ggplot(mtcars, aes(x = wt, y = mpg, col=cyl_am, size=disp)) +
geom_point() + scale_color_manual(values = myCol) +
facet_grid(gear ~ vs)
# NOTE(review): detaching and re-attaching dplyr — presumably to push it back
# to the front of the search path so its verbs win any masking conflicts
# (see the masking messages below); confirm this is still needed.
library(dplyr)
detach(package:dplyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:xts':
##
## first, last
## The following objects are masked from 'package:data.table':
##
## between, last
## The following object is masked from 'package:purrr':
##
## order_by
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Reshape msleep to long form: one row per (animal, sleep-type) with columns
# sleep ("total"/"rem") and time (hours), dropping animals missing either value.
# NOTE(review): tidyr::gather() is superseded — pivot_longer() is the modern
# equivalent — kept here to match the original course code.
mamsleep <- tidyr::gather(ggplot2::msleep %>%
mutate(total = sleep_total, rem=sleep_rem) %>%
select(vore, name, total, rem) %>%
filter(!is.na(total), !is.na(rem)),
sleep, time, -c(vore, name))
# Fix level order so "total" plots before "rem"
mamsleep$sleep <- factor(mamsleep$sleep, levels=c("total", "rem"))
str(mamsleep)
## Classes 'tbl_df', 'tbl' and 'data.frame': 122 obs. of 4 variables:
## $ vore : chr "omni" "herbi" "omni" "herbi" ...
## $ name : chr "Owl monkey" "Mountain beaver" "Greater short-tailed shrew" "Cow" ...
## $ sleep: Factor w/ 2 levels "total","rem": 1 1 1 1 1 1 1 1 1 1 ...
## $ time : num 17 14.4 14.9 4 14.4 8.7 10.1 5.3 9.4 10 ...
# Basic scatter plot
ggplot(mamsleep, aes(x=time, y=name, col=sleep)) + geom_point()
# Facet rows according to vore
ggplot(mamsleep, aes(x=time, y=name, col=sleep)) + geom_point() + facet_grid(vore ~ .)
# Specify scale and space arguments to free up rows
# (free_y lets each facet keep only its own animals and size itself to fit)
ggplot(mamsleep, aes(x=time, y=name, col=sleep)) + geom_point() +
facet_grid(vore ~ ., scale="free_y", space="free_y")
The themes layer controls all the “non-data ink” on your plot:
Often an objective to repeat the theme multiple times in the same presentation:
Example code includes:
# Rough re-engineer of what is pre-defined as z
data(mtcars)
mtcars$cyl <- factor(mtcars$cyl)
myColors <- c(brewer.pal(9, "Blues"))[c(5, 7, 9)]
origZ <- ggplot(mtcars, aes(x=wt, y=mpg, col=cyl)) +
geom_point(size=2) +
geom_smooth(method="lm", se=FALSE) +
facet_wrap(~ cyl, nrow=1) +
scale_x_continuous("Weight (lb/1000)") +
scale_y_continuous("Miles / (US) gallon") +
scale_color_manual("Cylinders", values=myColors)
# z is mutated incrementally below; origZ keeps a pristine copy for reuse later
z <- origZ
# Plot 1: change the plot background color to myPink:
myPink <- "#FEE0D2"
z + theme(plot.background = element_rect(fill = myPink))
# Plot 2: adjust the border to be a black line of size 3
z + theme(plot.background = element_rect(fill = myPink, color="black", size=3))
# Plot 3: set panel.background, legend.key, legend.background and strip.background to element_blank()
uniform_panels <- theme(panel.background = element_blank(),
legend.key = element_blank(),
legend.background=element_blank(),
strip.background = element_blank())
z <- z + theme(plot.background = element_rect(fill = myPink, color="black", size=3)) + uniform_panels
z
# Extend z with theme() function and three arguments
# (drop grid lines; draw black axis lines and ticks instead)
z <- z + theme(panel.grid=element_blank(), axis.line=element_line(color="black"),
axis.ticks=element_line(color="black")
)
z
# Extend z with theme() function and four arguments
myRed <- "#99000D"
z <- z + theme(strip.text = element_text(size=16, color=myRed),
axis.title.y=element_text(color=myRed, hjust=0, face="italic"),
axis.title.x=element_text(color=myRed, hjust=0, face="italic"),
axis.text=element_text(color="black")
)
z
# Move legend by position (coordinates are fractions of the plot area)
z + theme(legend.position = c(0.85, 0.85))
# Change direction
z + theme(legend.direction = "horizontal")
# Change location by name
z + theme(legend.position = "bottom")
# Remove legend entirely
z + theme(legend.position = "none")
# Increase horizontal spacing between facets.
# `panel.margin.x` was deprecated in ggplot2 2.2.0 (the warnings this chunk
# used to emit said so explicitly); `panel.spacing.x` is the replacement and
# produces the same layout without the deprecation warnings.
library(grid)
z + theme(panel.spacing.x = unit(2, "cm"))
# Add code to remove any excess plot margin space
# (plot.margin order is top, right, bottom, left)
z + theme(panel.spacing.x = unit(2, "cm"), plot.margin = unit(c(0, 0, 0, 0), "cm"))
# Make z2 the same as origZ (the un-themed plot saved earlier)
z2 <- origZ
# Theme layer saved as an object, theme_pink
# (one reusable object bundling every theme tweak applied to z above)
theme_pink <- theme(panel.background = element_blank(),
legend.key = element_blank(),
legend.background = element_blank(),
strip.background = element_blank(),
plot.background = element_rect(fill = myPink, color = "black", size = 3),
panel.grid = element_blank(),
axis.line = element_line(color = "black"),
axis.ticks = element_line(color = "black"),
strip.text = element_text(size = 16, color = myRed),
axis.title.y = element_text(color = myRed, hjust = 0, face = "italic"),
axis.title.x = element_text(color = myRed, hjust = 0, face = "italic"),
axis.text = element_text(color = "black"),
legend.position = "none")
# Apply theme_pink to z2
z2 + theme_pink
# Change code so that old theme is saved as old
# (theme_update() changes the session-wide default theme and returns the
# previous one, so it can be restored afterwards)
old <- theme_update(panel.background = element_blank(),
legend.key = element_blank(),
legend.background = element_blank(),
strip.background = element_blank(),
plot.background = element_rect(fill = myPink, color = "black", size = 3),
panel.grid = element_blank(),
axis.line = element_line(color = "black"),
axis.ticks = element_line(color = "black"),
strip.text = element_text(size = 16, color = myRed),
axis.title.y = element_text(color = myRed, hjust = 0, face = "italic"),
axis.title.x = element_text(color = myRed, hjust = 0, face = "italic"),
axis.text = element_text(color = "black"),
legend.position = "none")
# Display the plot z2 (now picks up the updated global default theme)
z2
# Restore the old plot
theme_set(old)
# Load ggthemes package
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 3.2.5
# Apply theme_tufte (minimal-ink theme after Tufte)
z2 + theme_tufte()
# Apply theme_tufte, modified:
z2 + theme_tufte() +
theme(legend.position = c(0.9, 0.9), legend.title=element_text(face="italic", size=12), axis.title=element_text(face="bold", size=14))
Best practices for plotting and visualization:
Recall that pie charts are just bar charts wrapped on to polar coordinates:
Suggestion that heat maps are “one of the least effective forms of communication”:
Example code includes:
data(mtcars)
mtcars$cyl <- factor(mtcars$cyl)
mtcars$am <- factor(mtcars$am)
m <- ggplot(mtcars, aes(x = cyl, y = wt))
# Draw dynamite plot (bar of means with SD error bars — shown here as the
# anti-pattern the course warns against)
m +
stat_summary(fun.y = mean, geom = "bar", fill = "skyblue") +
stat_summary(fun.data = mean_sdl, fun.args = list(mult = 1), geom = "errorbar", width = 0.1)
# Base layers
m <- ggplot(mtcars, aes(x = cyl, y = wt, col = am, fill = am))
# Plot 1: Draw dynamite plot
m +
stat_summary(fun.y = mean, geom = "bar") +
stat_summary(fun.data = mean_sdl, fun.args = list(mult = 1), geom = "errorbar", width = 0.1)
# Plot 2: Set position dodge in each stat function
# (string "dodge" mis-aligns the error bars with the bars — fixed in Plot 3)
m +
stat_summary(fun.y = mean, geom = "bar", position = "dodge") +
stat_summary(fun.data = mean_sdl, fun.args = list(mult = 1),
geom = "errorbar", width = 0.1, position = "dodge")
# Set your dodge posn manually (shared object keeps bars and errorbars aligned)
posn.d <- position_dodge(0.9)
# Plot 3: Redraw dynamite plot
m +
stat_summary(fun.y = mean, geom = "bar", position = posn.d) +
stat_summary(fun.data = mean_sdl, fun.args = list(mult = 1), geom = "errorbar",
width = 0.1, position = posn.d
)
# Pre-aggregate per-cylinder mean, SD, and group proportion with tapply
cyl_means <- tapply(mtcars$wt, mtcars$cyl, FUN=mean)
cyl_sd <- tapply(mtcars$wt, mtcars$cyl, FUN=sd)
cyl_ct <- tapply(mtcars$wt, mtcars$cyl, FUN=length)
mtcars.cyl <- data.frame(cyl=as.factor(names(cyl_means)),
wt.avg=cyl_means, sd=cyl_sd, prop=cyl_ct/sum(cyl_ct)
)
# Base layers
m <- ggplot(mtcars.cyl, aes(x = cyl, y = wt.avg))
# Plot 1: Draw bar plot
m + geom_bar(stat = "identity", fill="skyblue")
# Plot 2: Add width aesthetic (bar width encodes the group's share of cars;
# ggplot warns because width is not a documented aesthetic)
m + geom_bar(stat = "identity", fill="skyblue", aes(width=prop))
## Warning: Ignoring unknown aesthetics: width
# Plot 3: Add error bars
m + geom_bar(stat = "identity", fill="skyblue", aes(width=prop)) +
geom_errorbar(aes(ymin = wt.avg - sd, ymax = wt.avg + sd), width=0.1)
## Warning: Ignoring unknown aesthetics: width
# Faceted pie charts: proportion of am within each cyl group
ggplot(mtcars, aes(x = factor(1), fill = am)) +
geom_bar(position = "fill", width=1) +
facet_grid(. ~ cyl) +
coord_polar(theta="y")
library(GGally)
## Warning: package 'GGally' was built under R version 3.2.5
## Warning: replacing previous import by 'utils::capture.output' when loading
## 'GGally'
## Warning: replacing previous import by 'utils::head' when loading 'GGally'
## Warning: replacing previous import by 'utils::installed.packages' when
## loading 'GGally'
## Warning: replacing previous import by 'utils::str' when loading 'GGally'
##
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
##
## nasa
# All columns except am
group_by_am <- match("am", names(mtcars))
# NOTE(review): 1:11 hard-codes mtcars having 11 columns; this only holds
# because data(mtcars) was re-loaded above — seq_along(names(mtcars)) would
# be the robust form.
my_names_am <- (1:11)[-group_by_am]
# Basic parallel plot - each variable plotted as a z-score transformation
ggparcoord(mtcars, my_names_am, groupColumn = group_by_am, alpha = 0.8)
barley <- lattice::barley
str(barley)
## 'data.frame': 120 obs. of 4 variables:
## $ yield : num 27 48.9 27.4 39.9 33 ...
## $ variety: Factor w/ 10 levels "Svansota","No. 462",..: 3 3 3 3 3 3 7 7 7 7 ...
## $ year : Factor w/ 2 levels "1932","1931": 2 2 2 2 2 2 2 2 2 2 ...
## $ site : Factor w/ 6 levels "Grand Rapids",..: 3 6 4 5 1 2 3 6 4 5 ...
# Create color palette
myColors <- brewer.pal(9, "Reds")
# Build the heat map from scratch (the course's example of a weak encoding)
ggplot(barley, aes(x=year, y=variety, fill=yield)) +
geom_tile() +
facet_wrap(~ site, ncol=1) +
scale_fill_gradientn(colors = myColors)
# Line plots (position, not color, encodes yield — the preferred alternative)
ggplot(barley, aes(x=year, y=yield, col=variety, group=variety)) +
geom_line() +
facet_wrap(~ site, nrow=1)
# Per-site mean lines with a mean +/- 1 SD ribbon (col=NA hides ribbon borders)
ggplot(barley, aes(x=year, y=yield, col=site, group=site, fill=site)) +
stat_summary(fun.y = mean, geom="line") +
stat_summary(fun.data = mean_sdl, fun.args=list(mult=1), geom="ribbon", col=NA, alpha=0.1)
CHIS (California Health Information Study) - Descriptive Statistics Case Study:
The mosaic plot is a sequence of rectangles, each sized based on the size of the group:
Case study code - creating a Marimekko plot:
# TBD - need the raw data first!
Advanced course that builds on concepts from the previous modules:
Refresher code from previous modules includes:
# Create movies_small: reproducible 1000-row sample of the movies data,
# with rating rounded and converted to a factor.
library(ggplot2movies)
## Warning: package 'ggplot2movies' was built under R version 3.2.5
set.seed(123)
movies_small <- movies[sample(nrow(movies), 1000), ]
movies_small$rating <- factor(round(movies_small$rating))
# Create movies_small
# NOTE(review): this chunk duplicates the one above verbatim (carried over
# from two separate course exercises); the repeat is harmless but redundant.
library(ggplot2movies)
set.seed(123)
movies_small <- movies[sample(nrow(movies), 1000), ]
movies_small$rating <- factor(round(movies_small$rating))
# Explore movies_small with str()
str(movies_small)
## Classes 'tbl_df', 'tbl' and 'data.frame': 1000 obs. of 24 variables:
## $ title : chr "Fair and Worm-er" "Shelf Life" "House: After Five Years of Living" "Three Long Years" ...
## $ year : int 1946 2000 1955 2003 1963 1992 1999 1972 1994 1985 ...
## $ length : int 7 4 11 76 103 107 87 84 127 94 ...
## $ budget : int NA NA NA NA NA NA NA NA NA NA ...
## $ rating : Factor w/ 10 levels "1","2","3","4",..: 7 7 6 8 8 5 4 8 5 5 ...
## $ votes : int 16 11 15 11 103 28 105 9 37 28 ...
## $ r1 : num 0 0 14.5 4.5 4.5 4.5 14.5 0 4.5 4.5 ...
## $ r2 : num 0 0 0 0 4.5 0 4.5 0 4.5 0 ...
## $ r3 : num 0 0 4.5 4.5 0 4.5 4.5 0 14.5 4.5 ...
## $ r4 : num 0 0 4.5 0 4.5 4.5 4.5 0 4.5 14.5 ...
## $ r5 : num 4.5 4.5 0 0 4.5 0 4.5 14.5 24.5 4.5 ...
## $ r6 : num 4.5 24.5 34.5 4.5 4.5 0 14.5 0 4.5 14.5 ...
## $ r7 : num 64.5 4.5 24.5 0 14.5 4.5 14.5 14.5 14.5 14.5 ...
## $ r8 : num 14.5 24.5 4.5 4.5 14.5 24.5 14.5 24.5 14.5 14.5 ...
## $ r9 : num 0 0 0 14.5 14.5 24.5 14.5 14.5 4.5 4.5 ...
## $ r10 : num 14.5 24.5 14.5 44.5 44.5 24.5 14.5 44.5 4.5 24.5 ...
## $ mpaa : chr "" "" "" "" ...
## $ Action : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Animation : int 1 0 0 0 0 0 0 0 0 0 ...
## $ Comedy : int 1 0 0 1 0 1 1 1 0 0 ...
## $ Drama : int 0 0 0 0 1 0 0 0 1 1 ...
## $ Documentary: int 0 0 1 0 0 0 0 0 0 0 ...
## $ Romance : int 0 0 0 0 0 0 1 0 0 0 ...
## $ Short : int 1 1 1 0 0 0 0 0 0 0 ...
# Build a scatter plot with mean and 95% CI
# (crossbar geom marks the mean with CI box per rating; y is log10-scaled)
ggplot(movies_small, aes(x = rating, y = votes)) +
geom_point() +
stat_summary(fun.data = "mean_cl_normal",
geom = "crossbar",
width = 0.2,
col = "red") +
scale_y_log10()
# Reproduce the plot
# (both axes log10; expression() renders subscripted axis labels)
ggplot(diamonds, aes(x=carat, y=price, col=color)) +
geom_point(alpha=0.5, size=0.5, shape=16) +
scale_x_log10(limits=c(0.1, 10)) +
xlab(expression(log[10](Carat))) +
scale_y_log10(limits=c(100, 100000)) +
ylab(expression(log[10](Price))) +
scale_color_brewer(palette="YlOrRd") +
coord_equal() +
theme_classic()
# Add smooth layer and facet the plot
# (axis titles folded into the scale_* calls this time)
ggplot(diamonds, aes(x = carat, y = price, col = color)) +
geom_point(alpha = 0.5, size = .5, shape = 16) +
scale_x_log10(expression(log[10](Carat)), limits = c(0.1,10)) +
scale_y_log10(expression(log[10](Price)), limits = c(100,100000)) +
scale_color_brewer(palette = "YlOrRd") +
coord_equal() +
theme_classic() +
stat_smooth(method="lm") +
facet_grid(. ~ cut)
Box plots creation and usage (more for an academic audience). John Tukey - “Exploratory Data Analysis” (median, IQR, max, min, extremes outside of 1.5 * IQR). Whiskers are only drawn up to the “fence” (1.5 * IQR); everything above this is the outlier.
Example code includes:
# Add a boxplot geom
d <- ggplot(movies_small, aes(x = rating, y = votes)) +
geom_point() +
geom_boxplot() +
stat_summary(fun.data = "mean_cl_normal",
geom = "crossbar",
width = 0.2,
col = "red")
# Untransformed plot
d
# Transform the scale (data transformed BEFORE stats: box stats computed on
# log10 values)
d + scale_y_log10()
# Transform the coordinates (produces error in RStudio - commented out)
# (coord_trans transforms AFTER stats, so boxes would be computed on raw data)
# d + coord_trans(y = "log10")
# Plot object p
p <- ggplot(diamonds, aes(x = carat, y = price))
# Use cut_interval: 10 bins of equal width across the carat range
p + geom_boxplot(aes(group = cut_interval(carat, n=10)))
# Use cut_number: 10 bins with (roughly) equal numbers of observations
p + geom_boxplot(aes(group = cut_number(carat, n=10)))
# Use cut_width: bins of fixed width 0.25 carat
p + geom_boxplot(aes(group = cut_width(carat, width=0.25)))
Density plots are a good way to describe univariate data. There are many statistics (e.g., PDF or Probability Density Function), including the “Kernel Density Estimate” (KDE) - “sum of bumps placed at the observations; kernel function determines the shape of the bumps, while window width, h, determines their width”:
Example code includes:
# Set up keys
t_norm <- c(-0.560475646552213,-0.23017748948328,1.55870831414912,0.070508391424576,0.129287735160946,1.71506498688328,0.460916205989202,-1.26506123460653,-0.686852851893526,-0.445661970099958,1.22408179743946,0.359813827057364,0.400771450594052,0.11068271594512,-0.555841134754075,1.78691313680308,0.497850478229239,-1.96661715662964,0.701355901563686,-0.472791407727934,-1.06782370598685,-0.217974914658295,-1.02600444830724,-0.72889122929114,-0.625039267849257,-1.68669331074241,0.837787044494525,0.153373117836515,-1.13813693701195,1.25381492106993,0.426464221476814,-0.295071482992271,0.895125661045022,0.878133487533042,0.821581081637487,0.688640254100091,0.553917653537589,-0.0619117105767217,-0.305962663739917,-0.380471001012383,-0.694706978920513,-0.207917278019599,-1.26539635156826,2.16895596533851,1.20796199830499,-1.12310858320335,-0.402884835299076,-0.466655353623219,0.779965118336318,-0.0833690664718293,0.253318513994755,-0.028546755348703,-0.0428704572913161,1.36860228401446,-0.225770985659268,1.51647060442954,-1.54875280423022,0.584613749636069,0.123854243844614,0.215941568743973,0.379639482759882,-0.502323453109302,-0.33320738366942,-1.01857538310709,-1.07179122647558,0.303528641404258,0.448209778629426,0.0530042267305041,0.922267467879738,2.05008468562714,-0.491031166056535,-2.30916887564081,1.00573852446226,-0.709200762582393,-0.688008616467358,1.0255713696967,-0.284773007051009,-1.22071771225454,0.18130347974915,-0.138891362439045,0.00576418589988693,0.38528040112633,-0.370660031792409,0.644376548518833,-0.220486561818751,0.331781963915697,1.09683901314935,0.435181490833803,-0.325931585531227,1.14880761845109,0.993503855962119,0.54839695950807,0.238731735111441,-0.627906076039371,1.36065244853001,-0.600259587147127,2.18733299301658,1.53261062618519,-0.235700359100477,-1.02642090030678,-0.710406563699301,0.25688370915653,-0.246691878462374,-0.347542599397733,-0.951618567265016,-0.0450277248089203,-0.784904469457076,-1.66794193658814,-0.380226520287762,0.91
8996609060766,-0.575346962608392,0.607964322225033,-1.61788270828916,-0.0555619655245394,0.519407203943462,0.301153362166714,0.105676194148943,-0.640706008305376,-0.849704346033582,-1.02412879060491,0.117646597100126,-0.947474614184802,-0.490557443700668,-0.256092192198247,1.84386200523221,-0.651949901695459,0.235386572284857,0.0779608495637108,-0.961856634130129,-0.0713080861235987,1.44455085842335,0.451504053079215,0.0412329219929399,-0.422496832339625,-2.05324722154052,1.13133721341418,-1.46064007092482,0.739947510877334,1.90910356921748,-1.4438931609718,0.701784335374711,-0.262197489402468,-1.57214415914549,-1.51466765378175,-1.60153617357459,-0.530906522170303,-1.4617555849959,0.687916772975828,2.10010894052567,-1.28703047603518,0.787738847475178,0.76904224100091,0.332202578950118,-1.00837660827701,-0.119452606630659,-0.280395335170247,0.56298953322048,-0.372438756103829,0.976973386685621,-0.374580857767014,1.05271146557933,-1.04917700666607,-1.26015524475811,3.2410399349424,-0.416857588160432,0.298227591540715,0.636569674033849,-0.483780625708744,0.516862044313609,0.368964527385086,-0.215380507641693,0.0652930335253153,-0.034067253738464,2.12845189901618,-0.741336096272828,-1.09599626707466,0.0377883991710788,0.310480749443137,0.436523478910183,-0.458365332711106,-1.06332613397119,1.26318517608949,-0.349650387953555,-0.865512862653374,-0.236279568941097,-0.197175894348552,1.10992028971364,0.0847372921971965,0.754053785184521,-0.499292017172261,0.214445309581601,-0.324685911490835,0.0945835281735714,-0.895363357977542,-1.31080153332797,1.99721338474797,0.600708823672418,-1.25127136162494,-0.611165916680421,-1.18548008459731)
t_bimodal <- c(0.19881034888372,-0.68758702356649,-2.26514505669635,-1.45680594076791,-2.41433994791886,-2.47624689461558,-2.78860283785024,-2.59461726745951,-0.34909253266331,-2.05402812508544,-1.88075476357242,-1.75631257040091,-0.767524121514662,-2.51606383094478,-2.99250715039204,-0.32430306759681,-2.44116321690529,-2.72306596993987,-3.23627311888329,-3.2847157223178,-2.57397347929799,-1.38201418283347,-0.890151861070282,-1.29241164616441,-2.36365729709525,-1.9402500626154,-2.70459646368007,-2.71721816157401,-1.11534950102308,-3.01559257860354,-0.0447060345075361,-2.09031959396585,-1.78546117337078,-2.73852770473957,-2.57438868976327,-3.31701613230524,-2.18292538837273,-1.58101759507554,-1.67569565583862,-2.78153648705475,-2.788621970854,-2.50219871834286,-0.503939330153649,-3.13730362066574,-2.1790515943802,-0.0976381783210729,-2.10097488532881,-3.35984070382139,-2.66476943527406,-1.51454002109512,-2.37560287166977,-2.56187636354978,-2.34391723412846,-1.90950335286078,-0.401491228854174,-2.08856511213888,-0.919200503848483,-1.36924588434943,-2.11363989550614,-3.5329020028906,-2.52111731755252,-2.48987045313847,-1.95284556723847,-0.699801322333179,0.293078973831094,-0.452418941016231,-2.13315096432894,-3.75652739555764,-2.38877986407174,-1.91079277692671,-1.15498699593256,-1.03747203151573,-1.31569057058354,-3.39527434979947,-1.15035695436664,-2.44655721642722,-1.82519729983874,-1.92544882282627,-1.57183323502949,-1.97532501717386,-3.66747509758566,-1.26350403522656,-1.61397343165032,-2.26565162527822,-1.88185548895332,-1.86596135463154,-1.778980531439,-0.359153834022514,-2.21905037893348,-1.83193461611534,-0.831616126930907,-0.945818976623081,-0.854736889619643,-2.57746800105956,0.00248273029282942,-1.93329912906982,-0.133148155293138,-3.35090268603071,-1.97901641364576,-0.750085429030784,1.2847578127772,1.24731103178226,1.06146129639311,0.947486720661263,1.5628404668196,2.33117917295898,-0.01421049792072,2.21198043337229,3.23667504641657,4.03757401824044,3.301
17599220059,2.75677476379596,0.27326960088567,1.39849329199322,1.64795354341738,2.70352390275689,1.89432866599623,0.741351371939828,3.68443570809411,2.91139129179596,2.23743027249103,3.21810861032581,0.661225712765028,2.6608202977898,1.47708762368658,2.68374552185071,1.93917804533993,2.63296071303145,3.33551761505939,2.0072900903169,3.01755863695209,0.811565964852021,1.27839555956398,3.51921771138818,2.37738797302393,-0.0522228204337298,0.635962547917625,1.79921898441088,2.86577940433449,1.89811674428478,2.62418747202065,2.95900537778783,3.67105482886294,2.05601673327496,1.9480180938191,0.24676264085773,2.09932759408783,1.42814994210444,1.02599041719591,1.82009376895246,3.01494317274366,0.00725151131141755,1.57272071279457,2.11663728358271,1.10679242994505,2.33390294249923,2.41142992061573,1.96696384072401,-0.465898193760027,4.57145814586664,1.7947007425318,2.65119328158767,2.27376649103655,3.02467323481835,2.81765944637409,1.79020682877149,2.37816777220851,1.05459116887611,2.85692301089932,1.53896166111565,4.41677335378821,0.348951104311813,1.53601275703399,2.82537986275924,2.51013254687866,1.410518961485,1.00321925779248,2.1444757047107,1.98569258683309,0.20971876273594,2.03455106713385,2.19023031569246,2.17472639698184,0.944982957397319,2.47613327830263,3.37857013695924,2.45623640317981,0.864411529625657,1.5643545303081,2.34610361955361,1.35295436868173,-0.157646335015277,2.88425082002821,1.17052238837548,1.42643972923172,3.50390060900454,1.22585507039459,2.8457315401893,0.739317121181225,1.64545759692602)
t_uniform <- c(-0.117272665724158,-0.536618106998503,-1.51491178292781,-1.81202527787536,-0.948814783245325,1.87456467188895,-0.0460180705413222,-0.0887118810787797,0.995171524584293,0.670560925267637,-1.80233761295676,0.780420977622271,-0.546953733079135,1.53653442952782,1.10118891857564,-1.44318543467671,-0.819962915033102,-1.49566885828972,0.359606495127082,0.246702435426414,0.754884480498731,-0.754916763864458,0.422347365878522,1.96413727570325,0.972819660790265,-1.69657147862017,-0.195324374362826,-1.78585226181895,-0.641777943819761,0.935808595269918,-1.98357223812491,1.08763792924583,-0.148099155165255,0.883361400105059,0.666022868826985,0.288294882513583,0.815251800231636,0.628884241916239,-0.842591419816017,-1.61104217730463,1.84968528430909,0.945336116477847,0.450893874280155,-1.52028447668999,0.201036185957491,-0.948974888771772,1.5934433247894,-1.96328021679074,-1.05506025627255,-1.47982161864638,-0.695067413151264,0.90559555310756,1.96698094159365,0.86053411103785,0.0177592439576983,-0.255809312686324,1.79530100245029,-1.51927404943854,-1.69941929355264,1.55608597118407,-0.302191619761288,-1.8305572848767,0.5897685745731,-0.125523350201547,0.471704493276775,-0.916738269850612,-1.3708188533783,-1.54298291914165,0.0307314526289701,0.192129150032997,-1.43741524219513,-1.32092379964888,1.0479412926361,0.1095797624439,1.44395743031055,0.69421995151788,-1.94783588126302,0.77279560547322,1.56685492862016,0.52740070130676,-1.57082138024271,1.684260815382,0.701449524611235,-1.40562514960766,0.981367369182408,1.77039544750005,-0.316866497509181,-0.809107687324286,-0.962293319404125,-1.10847473237664,0.262617373839021,1.02660053130239,0.678414183668792,0.186091177165508,1.24585463106632,1.03666720818728,-1.9197261352092,-0.476385199464858,-1.79647954553366,1.19161174912006,1.69479685276747,0.170393481850624,1.40945840068161,0.33425145316869,0.673294574022293,0.0452583860605955,1.05100235715508,1.61344915162772,1.28189804870635,-1.71427260152996,-1.98441462777555,-
1.79120553657413,1.46624071896076,0.304980675689876,-0.744629711844027,1.83786313515157,0.364775028079748,0.125637343153358,-0.464253313839436,-0.721787082031369,1.23354502115399,-1.83232200611383,-0.545029221102595,1.4263878678903,0.791786313988268,0.737945891916752,-0.607939789071679,0.218727317638695,-1.45102552976459,1.13972622528672,1.54745028633624,-1.18361646682024,1.08249184582382,0.385451843030751,1.83067896962166,-1.36524640955031,0.103897096589208,1.49260547012091,1.47882428113371,-1.90524542704225,1.90355877391994,-0.0391032071784139,-0.443318707868457,-0.329780160449445,-1.62829671800137,-1.35276315920055,-0.378334019333124,-0.632742223329842,-0.33897018712014,-0.783790123648942,0.241122149862349,-1.37650339771062,1.82631905656308,-1.82413349859416,-0.511369029060006,1.85046136565506,0.581710062921047,-1.75494954269379,-0.360216286033392,-0.296379463747144,0.0326323388144374,-0.201600356958807,0.493045534007251,-1.44008827582002,1.63178560324013,0.277773099020123,0.193122295662761,-1.53268950991333,1.04811332933605,-0.0865222131833434,1.12787565868348,-1.8158938055858,1.27937867119908,-0.922366210259497,-0.868598341941833,0.572886237874627,1.79247535113245,-1.97200618218631,-0.593529311940074,-0.323817100375891,-0.168492811731994,0.846770317293704,1.67939223069698,0.508442802354693,1.60872711334378,1.02931663673371,-1.44856653735042,-1.38698048796505,-1.2347512524575,-0.267259437590837,-1.65112279262394,-1.10487999580801,0.28859471809119,-0.399323286488652,0.261861522682011,1.31849551480263,0.568455277942121,-0.43400499317795,0.838319418951869,-1.56470370758325)
test_data <- data.frame(norm=t_norm, bimodal=t_bimodal, uniform=t_uniform)
small_data <- data.frame(x=c(-3.5, 0, 0.5, 6))
# Kernel density estimate of the "norm" column: d
d <- density(test_data$norm)
# The mode is the x-position of the peak of the estimated density.
# NOTE(review): `mode` shadows base::mode() (storage mode of an object);
# harmless in this script, but a name like `dens_mode` would be safer.
mode <- d$x[which.max(d$y)]
# Density curve with a rug of the raw observations and a red line at the mode
ggplot(test_data, aes(x = norm)) +
geom_rug() +
geom_density() +
geom_vline(xintercept = mode, col = "red")
# Sample mean/sd, forwarded to dnorm() via stat_function(args = ...)
fun_args <- list(mean = mean(test_data$norm), sd = sd(test_data$norm))
# Histogram on the density scale, empirical density (red), fitted normal (blue)
ggplot(test_data, aes(x = norm)) +
geom_histogram(aes(y=..density..)) +
geom_density(col="red") +
stat_function(fun=dnorm, args=fun_args, col="blue")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Get the bandwidth that density() selects for the tiny 4-point sample
get_bw <- density(small_data$x)$bw
# Base plot: rug of the four points, fixed y-range so the panels compare fairly
p <- ggplot(small_data, aes(x = x)) +
geom_rug() +
coord_cartesian(ylim = c(0,0.5))
# Default bandwidth vs. adjust = 0.25 (a quarter of the default bandwidth)
p + geom_density()
p + geom_density(adjust=0.25)
# p + geom_density(bw = 0.25*get_bw) ** does not work with my version of R/ggplot2 **
# Alternative kernels: rectangular ("r") and Epanechnikov ("e")
p + geom_density(kernel="r")
p + geom_density(kernel="e")
There are multiple options for handling multiple groups (levels within a factor) or variables:
Example code includes:
# Create the data
norm0 <- c(-0.56, -0.23, 1.56, 0.07, 0.13, 1.72, 0.46, -1.27, -0.69, -0.45, 1.22, 0.36, 0.4, 0.11, -0.56, 1.79, 0.5, -1.97, 0.7, -0.47, -1.07, -0.22, -1.03, -0.73, -0.63, -1.69, 0.84, 0.15, -1.14, 1.25, 0.43, -0.3, 0.9, 0.88, 0.82, 0.69, 0.55, -0.06, -0.31, -0.38, -0.69, -0.21, -1.27, 2.17, 1.21, -1.12, -0.4, -0.47, 0.78, -0.08, 0.25, -0.03, -0.04, 1.37, -0.23, 1.52, -1.55, 0.58, 0.12, 0.22, 0.38, -0.5, -0.33, -1.02, -1.07, 0.3, 0.45, 0.05, 0.92, 2.05, -0.49, -2.31, 1.01, -0.71, -0.69, 1.03, -0.28, -1.22, 0.18, -0.14, 0.01, 0.39, -0.37, 0.64, -0.22, 0.33, 1.1, 0.44, -0.33, 1.15, 0.99, 0.55, 0.24, -0.63, 1.36, -0.6, 2.19, 1.53, -0.24, -1.03, -0.71, 0.26, -0.25, -0.35, -0.95, -0.05, -0.78, -1.67, -0.38, 0.92, -0.58, 0.61, -1.62, -0.06, 0.52, 0.3, 0.11, -0.64, -0.85, -1.02, 0.12, -0.95, -0.49, -0.26, 1.84, -0.65, 0.24, 0.08, -0.96, -0.07, 1.44, 0.45, 0.04, -0.42, -2.05, 1.13, -1.46, 0.74, 1.91, -1.44, 0.7, -0.26, -1.57, -1.51, -1.6, -0.53, -1.46, 0.69, 2.1, -1.29, 0.79, 0.77, 0.33, -1.01, -0.12, -0.28, 0.56, -0.37, 0.98, -0.37, 1.05, -1.05, -1.26, 3.24, -0.42, 0.3, 0.64, -0.48, 0.52, 0.37, -0.22, 0.07, -0.03, 2.13, -0.74, -1.1, 0.04, 0.31, 0.44, -0.46, -1.06, 1.26, -0.35, -0.87, -0.24, -0.2, 1.11, 0.08, 0.75, -0.5, 0.21, -0.32, 0.09, -0.9, -1.31, 2, 0.6, -1.25, -0.61, -1.19)
bimodal0 <- c(0.2, -0.69, -2.27, -1.46, -2.41, -2.48, -2.79, -2.59, -0.35, -2.05, -1.88, -1.76, -0.77, -2.52, -2.99, -0.32, -2.44, -2.72, -3.24, -3.28, -2.57, -1.38, -0.89, -1.29, -2.36, -1.94, -2.7, -2.72, -1.12, -3.02, -0.04, -2.09, -1.79, -2.74, -2.57, -3.32, -2.18, -1.58, -1.68, -2.78, -2.79, -2.5, -0.5, -3.14, -2.18, -0.1, -2.1, -3.36, -2.66, -1.51, -2.38, -2.56, -2.34, -1.91, -0.4, -2.09, -0.92, -1.37, -2.11, -3.53, -2.52, -2.49, -1.95, -0.7, 0.29, -0.45, -2.13, -3.76, -2.39, -1.91, -1.15, -1.04, -1.32, -3.4, -1.15, -2.45, -1.83, -1.93, -1.57, -1.98, -3.67, -1.26, -1.61, -2.27, -1.88, -1.87, -1.78, -0.36, -2.22, -1.83, -0.83, -0.95, -0.85, -2.58, 0, -1.93, -0.13, -3.35, -1.98, -0.75, 1.28, 1.25, 1.06, 0.95, 1.56, 2.33, -0.01, 2.21, 3.24, 4.04, 3.3, 2.76, 0.27, 1.4, 1.65, 2.7, 1.89, 0.74, 3.68, 2.91, 2.24, 3.22, 0.66, 2.66, 1.48, 2.68, 1.94, 2.63, 3.34, 2.01, 3.02, 0.81, 1.28, 3.52, 2.38, -0.05, 0.64, 1.8, 2.87, 1.9, 2.62, 2.96, 3.67, 2.06, 1.95, 0.25, 2.1, 1.43, 1.03, 1.82, 3.01, 0.01, 1.57, 2.12, 1.11, 2.33, 2.41, 1.97, -0.47, 4.57, 1.79, 2.65, 2.27, 3.02, 2.82, 1.79, 2.38, 1.05, 2.86, 1.54, 4.42, 0.35, 1.54, 2.83, 2.51, 1.41, 1, 2.14, 1.99, 0.21, 2.03, 2.19, 2.17, 0.94, 2.48, 3.38, 2.46, 0.86, 1.56, 2.35, 1.35, -0.16, 2.88, 1.17, 1.43, 3.5, 1.23, 2.85, 0.74, 1.65)
value2 <- c(-0.56, -0.23, 1.559, 0.071, 0.129, 1.715, 0.461, -1.265, -0.687, -0.446, 1.224, 0.36, 0.401, 0.111, -0.556, 1.787, 0.498, -1.967, 0.701, -0.473, -1.068, -0.218, -1.026, -0.729, -0.625, -1.687, 0.838, 0.153, -1.138, 1.254, 0.426, -0.295, 0.895, 0.878, 0.822, 0.689, 0.554, -0.062, -0.306, -0.38, -0.695, -0.208, -1.265, 2.169, 1.208, -1.123, -0.403, -0.467, 0.78, -0.083, 0.253, -0.029, -0.043, 1.369, -0.226, 1.516, -1.549, 0.585, 0.124, 0.216, 0.38, -0.502, -0.333, -1.019, -1.072, 0.304, 0.448, 0.053, 0.922, 2.05, -0.491, -2.309, 1.006, -0.709, -0.688, 1.026, -0.285, -1.221, 0.181, -0.139, 0.006, 0.385, -0.371, 0.644, -0.22, 0.332, 1.097, 0.435, -0.326, 1.149, 0.994, 0.548, 0.239, -0.628, 1.361, -0.6, 2.187, 1.533, -0.236, -1.026, -0.71, 0.257, -0.247, -0.348, -0.952, -0.045, -0.785, -1.668, -0.38, 0.919, -0.575, 0.608, -1.618, -0.056, 0.519, 0.301, 0.106, -0.641, -0.85, -1.024, 0.118, -0.947, -0.491, -0.256, 1.844, -0.652, 0.235, 0.078, -0.962, -0.071, 1.445, 0.452, 0.041, -0.422, -2.053, 1.131, -1.461, 0.74, 1.909, -1.444, 0.702, -0.262, -1.572, -1.515, -1.602, -0.531, -1.462, 0.688, 2.1, -1.287, 0.788, 0.769, 0.332, -1.008, -0.119, -0.28, 0.563, -0.372, 0.977, -0.375, 1.053, -1.049, -1.26, 3.241, -0.417, 0.298, 0.637, -0.484, 0.517, 0.369, -0.215, 0.065, -0.034, 2.128, -0.741, -1.096, 0.038, 0.31, 0.437, -0.458, -1.063, 1.263, -0.35, -0.866, -0.236, -0.197, 1.11, 0.085, 0.754, -0.499, 0.214, -0.325, 0.095, -0.895, -1.311, 1.997, 0.601, -1.251, -0.611, -1.185, 0.199, -0.688, -2.265, -1.457, -2.414, -2.476, -2.789, -2.595, -0.349, -2.054, -1.881, -1.756, -0.768, -2.516, -2.993, -0.324, -2.441, -2.723, -3.236, -3.285, -2.574, -1.382, -0.89, -1.292, -2.364, -1.94, -2.705, -2.717, -1.115, -3.016, -0.045, -2.09, -1.785, -2.739, -2.574, -3.317, -2.183, -1.581, -1.676, -2.782, -2.789, -2.502, -0.504, -3.137, -2.179, -0.098, -2.101, -3.36, -2.665, -1.515, -2.376, -2.562, -2.344, -1.91, -0.401, -2.089, -0.919, -1.369, -2.114, -3.533, -2.521, -2.49, -1.953, -0.7, 
0.293, -0.452, -2.133, -3.757, -2.389, -1.911, -1.155, -1.037, -1.316, -3.395, -1.15, -2.447, -1.825, -1.925, -1.572, -1.975, -3.667, -1.264, -1.614, -2.266, -1.882, -1.866, -1.779, -0.359, -2.219, -1.832, -0.832, -0.946, -0.855, -2.577, 0.002, -1.933, -0.133, -3.351, -1.979, -0.75, 1.285, 1.247, 1.061, 0.947, 1.563, 2.331, -0.014, 2.212, 3.237, 4.038, 3.301, 2.757, 0.273, 1.398, 1.648, 2.704, 1.894, 0.741, 3.684, 2.911, 2.237, 3.218, 0.661, 2.661, 1.477, 2.684, 1.939, 2.633, 3.336, 2.007, 3.018, 0.812, 1.278, 3.519, 2.377, -0.052, 0.636, 1.799, 2.866, 1.898, 2.624, 2.959, 3.671, 2.056, 1.948, 0.247, 2.099, 1.428, 1.026, 1.82, 3.015, 0.007, 1.573, 2.117, 1.107, 2.334, 2.411, 1.967, -0.466, 4.571, 1.795, 2.651, 2.274, 3.025, 2.818, 1.79, 2.378, 1.055, 2.857, 1.539, 4.417, 0.349, 1.536, 2.825, 2.51, 1.411, 1.003, 2.144, 1.986, 0.21, 2.035, 2.19, 2.175, 0.945, 2.476, 3.379, 2.456, 0.864, 1.564, 2.346, 1.353, -0.158, 2.884, 1.171, 1.426, 3.504, 1.226, 2.846, 0.739, 1.645)
dist2 <- c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2)
test_data <- data.frame(norm=norm0, bimodal=bimodal0)
test_data2 <- data.frame(dist=factor(dist2, labels=c("norm", "bimodal")), value=value2)
str(test_data)
## 'data.frame': 200 obs. of 2 variables:
## $ norm : num -0.56 -0.23 1.56 0.07 0.13 1.72 0.46 -1.27 -0.69 -0.45 ...
## $ bimodal: num 0.2 -0.69 -2.27 -1.46 -2.41 -2.48 -2.79 -2.59 -0.35 -2.05 ...
str(test_data2)
## 'data.frame': 400 obs. of 2 variables:
## $ dist : Factor w/ 2 levels "norm","bimodal": 1 1 1 1 1 1 1 1 1 1 ...
## $ value: num -0.56 -0.23 1.559 0.071 0.129 ...
# Plot with test_data
ggplot(test_data, aes(x = norm)) +
geom_rug() +
geom_density()
# Plot two distributions with test_data2
ggplot(test_data2, aes(x = value, fill = dist, col = dist)) +
geom_rug(alpha=0.6) +
geom_density(alpha=0.6)
data(msleep); mammals <- msleep[!is.na(msleep$vore), c("vore", "sleep_total")]
# Individual densities
ggplot(mammals[mammals$vore == "insecti", ], aes(x = sleep_total, fill = vore)) +
geom_density(col = NA, alpha = 0.35) +
scale_x_continuous(limits = c(0, 24)) +
coord_cartesian(ylim = c(0, 0.3))
# With faceting
ggplot(mammals, aes(x = sleep_total, fill = vore)) +
geom_density(col = NA, alpha = 0.35) +
scale_x_continuous(limits = c(0, 24)) +
coord_cartesian(ylim = c(0, 0.3)) +
facet_wrap(~ vore, nrow=2)
# Note that by default, the x ranges fill the scale
ggplot(mammals, aes(x = sleep_total, fill = vore)) +
geom_density(col = NA, alpha = 0.35) +
scale_x_continuous(limits = c(0, 24)) +
coord_cartesian(ylim = c(0, 0.3))
# Trim each density plot individually
ggplot(mammals, aes(x = sleep_total, fill = vore)) +
geom_density(col = NA, alpha = 0.35, trim=TRUE) +
scale_x_continuous(limits=c(0,24)) +
coord_cartesian(ylim = c(0, 0.3))
# Density plot from before
ggplot(mammals, aes(x = sleep_total, fill = vore)) +
geom_density(col = NA, alpha = 0.35) +
scale_x_continuous(limits = c(0, 24)) +
coord_cartesian(ylim = c(0, 0.3))
# Compute each vore group's share of all mammals as a per-row weight `n`
library(dplyr)
mammals2 <- mammals %>%
group_by(vore) %>%
mutate(n=n()/nrow(mammals))
# Density plot, weighted by group share so area reflects group size.
# Within each group the weights sum to the group's proportion (not 1),
# which is why density() emits the sum(weights) != 1 warnings below.
ggplot(mammals2, aes(x = sleep_total, fill = vore)) +
geom_density(aes(weight = n), col=NA, alpha = 0.35) +
scale_x_continuous(limits = c(0, 24)) +
coord_cartesian(ylim = c(0, 0.3))
## Warning in density.default(x, weights = w, bw = bw, adjust = adjust, kernel
## = kernel, : sum(weights) != 1 -- will not get true density
## Warning in density.default(x, weights = w, bw = bw, adjust = adjust, kernel
## = kernel, : sum(weights) != 1 -- will not get true density
## Warning in density.default(x, weights = w, bw = bw, adjust = adjust, kernel
## = kernel, : sum(weights) != 1 -- will not get true density
## Warning in density.default(x, weights = w, bw = bw, adjust = adjust, kernel
## = kernel, : sum(weights) != 1 -- will not get true density
# Violin plot
ggplot(mammals, aes(x = vore, y = sleep_total, fill = vore)) +
geom_violin()
# Violin plot, weighted
ggplot(mammals2, aes(x = vore, y = sleep_total, fill = vore)) +
geom_violin(aes(weight = n), col=NA)
## Warning in density.default(x, weights = w, bw = bw, adjust = adjust, kernel
## = kernel, : sum(weights) != 1 -- will not get true density
## Warning in density.default(x, weights = w, bw = bw, adjust = adjust, kernel
## = kernel, : sum(weights) != 1 -- will not get true density
## Warning in density.default(x, weights = w, bw = bw, adjust = adjust, kernel
## = kernel, : sum(weights) != 1 -- will not get true density
## Warning in density.default(x, weights = w, bw = bw, adjust = adjust, kernel
## = kernel, : sum(weights) != 1 -- will not get true density
data(faithful)
# Base layers
p <- ggplot(faithful, aes(x = waiting, y = eruptions)) +
scale_y_continuous(limits = c(1, 5.5), expand = c(0, 0)) +
scale_x_continuous(limits = c(40, 100), expand = c(0, 0)) +
coord_fixed(60 / 4.5)
# Use geom_density_2d()
p + geom_density_2d()
# Use stat_density_2d()
p + stat_density_2d(h=c(5, 0.5), aes(col=..level..))
# Load in the viridis package
library(viridis)
## Warning: package 'viridis' was built under R version 3.2.5
# Load in the viridis package
library(viridis)
# Add viridis color scale
ggplot(faithful, aes(x = waiting, y = eruptions)) +
scale_y_continuous(limits = c(1, 5.5), expand = c(0,0)) +
scale_x_continuous(limits = c(40, 100), expand = c(0,0)) +
coord_fixed(60/4.5) +
stat_density_2d(geom = "tile", aes(fill = ..density..), h=c(5,.5), contour = FALSE) +
scale_fill_viridis()
Defining largeness of a dataset - # observations, # variables, etc. For many observations (e.g., “diamonds” dataset) - can adjust point size, alpha blending, 2-d contours, etc.:
For many variables, can use pre-processing such as PCA:
Ternary plot - triangle plot where all the components add to 100% (e.g., soil composition):
Network plots (e.g., blood type donors):
Diagnostic plots:
Example code includes:
# pairs
data(iris)
pairs(iris[, 1:4])
# chart.Correlation
library(PerformanceAnalytics)
## Warning: package 'PerformanceAnalytics' was built under R version 3.2.5
##
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
##
## legend
chart.Correlation(iris[, 1:4])
## Warning in plot.window(...): "method" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "method" is not a graphical parameter
## Warning in title(...): "method" is not a graphical parameter
## Warning in plot.window(...): "method" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "method" is not a graphical parameter
## Warning in title(...): "method" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "method" is
## not a graphical parameter
## Warning in plot.window(...): "method" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "method" is not a graphical parameter
## Warning in title(...): "method" is not a graphical parameter
## Warning in plot.window(...): "method" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "method" is not a graphical parameter
## Warning in title(...): "method" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "method" is
## not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "method" is
## not a graphical parameter
## Warning in plot.window(...): "method" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "method" is not a graphical parameter
## Warning in title(...): "method" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "method" is
## not a graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "method" is not a
## graphical parameter
## Warning in plot.window(...): "method" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "method" is not a graphical parameter
## Warning in title(...): "method" is not a graphical parameter
## Warning in plot.window(...): "method" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "method" is not a graphical parameter
## Warning in title(...): "method" is not a graphical parameter
## Warning in plot.window(...): "method" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "method" is not a graphical parameter
## Warning in title(...): "method" is not a graphical parameter
## Warning in plot.window(...): "method" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "method" is not a graphical parameter
## Warning in title(...): "method" is not a graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "method" is not a
## graphical parameter
## Warning in plot.window(...): "method" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "method" is not a graphical parameter
## Warning in title(...): "method" is not a graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "method" is not a
## graphical parameter
## Warning in plot.window(...): "method" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "method" is not a graphical parameter
## Warning in title(...): "method" is not a graphical parameter
## Warning in plot.window(...): "method" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "method" is not a graphical parameter
## Warning in title(...): "method" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "method" is
## not a graphical parameter
## Warning in plot.window(...): "method" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "method" is not a graphical parameter
## Warning in title(...): "method" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "method" is
## not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "method" is
## not a graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "method" is not a
## graphical parameter
## Warning in plot.window(...): "method" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "method" is not a graphical parameter
## Warning in title(...): "method" is not a graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "method" is not a
## graphical parameter
## Warning in plot.window(...): "method" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "method" is not a graphical parameter
## Warning in title(...): "method" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "method" is
## not a graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "method" is not a
## graphical parameter
## Warning in plot.window(...): "method" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "method" is not a graphical parameter
## Warning in title(...): "method" is not a graphical parameter
# ggpairs
library(GGally)
data(mtcars); mtcars_fact <- mtcars; mtcars_fact$cyl <- as.factor(mtcars_fact$cyl)
ggpairs(mtcars_fact[, 1:3])
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
library(ggplot2)
library(reshape2)
##
## Attaching package: 'reshape2'
## The following objects are masked from 'package:data.table':
##
## dcast, melt
# Pairwise correlations of the columns of x, reshaped to long form twice:
# the upper triangle survives as "points" and the lower triangle as
# "labels"; the two are merged on the variable pair so each pair row
# carries both values (one of which is NA). Relies on reshape2::melt().
cor_list <- function(x) {
  corr <- cor(x)
  # Upper triangle only (lower triangle + diagonal blanked) -> "points"
  pts <- corr
  pts[lower.tri(pts, diag = TRUE)] <- NA
  pts <- melt(pts)
  names(pts)[3] <- "points"
  # Lower triangle only (upper triangle + diagonal blanked) -> "labels"
  lbl <- corr
  lbl[upper.tri(lbl, diag = TRUE)] <- NA
  lbl <- melt(lbl)
  names(lbl)[3] <- "labels"
  merge(pts, lbl)
}
# Calculate xx with cor_list
library(dplyr)
xx <- iris %>%
group_by(Species) %>%
do(cor_list(.[1:4]))
# Finish the plot
ggplot(xx, aes(x = Var1, y = Var2)) +
geom_point(aes(col = points, size = abs(points)), shape = 16) +
geom_text(aes(col = labels, size = labels, label = round(labels, 2))) +
scale_size(range = c(0, 6)) +
scale_color_gradient("r", limits = c(-1, 1)) +
scale_y_discrete("", limits = rev(levels(xx$Var1))) +
scale_x_discrete("") +
guides(size = FALSE) +
geom_abline(slope = -1, intercept = nlevels(xx$Var1) + 1) +
coord_fixed() +
facet_grid(. ~ Species) +
theme(axis.text.y = element_text(angle = 45, hjust = 1),
axis.text.x = element_text(angle = 45, hjust = 1),
strip.background = element_blank())
## Warning: Removed 30 rows containing missing values (geom_point).
## Warning: Removed 30 rows containing missing values (geom_text).
# Explore africa
library(GSIF)
## Warning: package 'GSIF' was built under R version 3.2.5
## GSIF version 0.5-3 (2016-07-18)
## URL: http://gsif.r-forge.r-project.org/
data(afsp)
str(afsp)
## List of 2
## $ sites :'data.frame': 26270 obs. of 9 variables:
## ..$ SOURCEID: Factor w/ 26270 levels "100902","100903",..: 5606 5607 5608 5609 5610 5611 5604 5605 5681 5682 ...
## ..$ SOURCEDB: Factor w/ 7 levels "Af_LDSF","Af_soilspec",..: 2 2 2 2 2 2 2 2 2 2 ...
## ..$ LONWGS84: num [1:26270] 36.1 36.1 36.1 36.1 36.1 ...
## ..$ LATWGS84: num [1:26270] -6.93 -6.93 -6.93 -6.93 -6.93 ...
## ..$ TIMESTRR: Date[1:26270], format: "2012-10-02" ...
## ..$ TAXGWRB : Factor w/ 40 levels "AC","AL","Alisol",..: NA NA NA NA NA NA NA NA NA NA ...
## ..$ TAXNUSDA: Factor w/ 960 levels "\"Plinthic\" Udoxic Dystropept",..: NA NA NA NA NA NA NA NA NA NA ...
## ..$ BDRICM : num [1:26270] NA NA NA NA NA NA NA NA NA NA ...
## ..$ DRAINFAO: Factor w/ 7 levels "E","I","M","P",..: NA NA NA NA NA NA NA NA NA NA ...
## $ horizons:'data.frame': 87693 obs. of 14 variables:
## ..$ SOURCEID: Factor w/ 26270 levels "100902","100903",..: 5606 5606 5607 5607 5607 5607 5608 5608 5609 5609 ...
## ..$ UHDICM : num [1:87693] 0 20 0 20 0 20 0 20 0 20 ...
## ..$ LHDICM : num [1:87693] 20 50 20 50 20 50 20 50 20 50 ...
## ..$ MCOMNS : Factor w/ 289 levels "10BG4/1","10R2.5/1",..: NA NA NA NA NA NA NA NA NA NA ...
## ..$ ORCDRC : num [1:87693] 4.36 3.69 3.67 4 5.25 4.57 6.02 3.6 3.33 5.33 ...
## ..$ PHIHOX : num [1:87693] 7.23 7.3 7.06 6.95 7.21 7.25 7.35 7.44 7.26 6.94 ...
## ..$ SNDPPT : num [1:87693] NA NA NA NA NA NA NA NA NA NA ...
## ..$ SLTPPT : num [1:87693] NA NA NA NA NA NA NA NA NA NA ...
## ..$ CLYPPT : num [1:87693] NA NA NA NA NA NA NA NA NA NA ...
## ..$ CRFVOL : num [1:87693] NA NA NA NA NA NA NA NA NA NA ...
## ..$ BLD : num [1:87693] NA NA NA NA NA NA NA NA NA NA ...
## ..$ CEC : num [1:87693] NA NA NA NA NA NA NA NA NA NA ...
## ..$ NTO : num [1:87693] 0.381 0.334 0.318 0.366 0.439 0.404 0.512 0.334 0.329 0.482 ...
## ..$ EMGX : num [1:87693] 2.66 2.95 2.73 3.84 2.36 3.16 3.59 4.37 3.71 2.76 ...
africa <- afsp$horizons[, c("SNDPPT", "SLTPPT", "CLYPPT")]
africa <- africa[complete.cases(africa), ]
africa <- africa / rowSums(africa)
africa <- africa %>% rename(Sand=SNDPPT, Silt=SLTPPT, Clay=CLYPPT)
str(africa)
## 'data.frame': 53830 obs. of 3 variables:
## $ Sand: num 0.845 0.838 0.694 0.661 0.647 ...
## $ Silt: num 0.0326 0.0313 0.0387 0.0782 0.1176 ...
## $ Clay: num 0.122 0.131 0.268 0.261 0.235 ...
# Sample 50 random soil-horizon rows. sample(n, ...) draws from seq_len(n),
# which avoids the fragile 1:nrow(df) pattern.
africa_sample <- africa[sample(nrow(africa), size = 50), ]
# Carry the row names along as an explicit ID column for plotting
africa_sample$ID <- row.names(africa_sample)
# Reshape wide (Sand/Silt/Clay columns) into long key/value pairs,
# one row per component per location
library(tidyr)
africa_sample_tidy <- gather(africa_sample, key, value, -ID)
# Stacked horizontal bars: each location's composition sums to 1
ggplot(africa_sample_tidy, aes(x = factor(ID), y = value, fill = key)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  scale_x_discrete(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0)) +
  labs(x = "Location", y = "Composition", fill = "Component") +
  theme_minimal()
# Load ggtern
library(ggtern)
## Warning: package 'ggtern' was built under R version 3.2.5
## --
## Consider donating at: http://ggtern.com
## Even small amounts (say $10-50) are very much appreciated!
## Remember to cite, run citation(package = 'ggtern') for further info.
## --
##
## Attaching package: 'ggtern'
## The following objects are masked from 'package:ggplot2':
##
## %+%, aes, annotate, calc_element, ggplot, ggplot_build,
## ggplot_gtable, ggplotGrob, ggsave, layer_data, theme,
## theme_bw, theme_classic, theme_dark, theme_gray, theme_light,
## theme_linedraw, theme_minimal, theme_void
# Build ternary plot
# ggtern(africa, aes(x = Sand, y = Silt, z = Clay)) +
# geom_point(shape=16, alpha=0.2)
# ggtern and ggplot2 are loaded
# Plot 1
# ggtern(africa, aes(x = Sand, y = Silt, z = Clay)) +
# geom_density_tern()
# Plot 2
# ggtern(africa, aes(x = Sand, y = Silt, z = Clay)) +
# stat_density_tern(geom="polygon", aes(fill = ..level.., alpha = ..level..),
# guides(fill = guide_legend(show = FALSE))
# )
# Load geomnet
library(geomnet)
## Warning: package 'geomnet' was built under R version 3.2.5
# Examine structure of madmen
str(geomnet::madmen)
## List of 2
## $ edges :'data.frame': 39 obs. of 2 variables:
## ..$ Name1: Factor w/ 9 levels "Betty Draper",..: 1 1 2 2 2 2 2 2 2 2 ...
## ..$ Name2: Factor w/ 39 levels "Abe Drexler",..: 15 31 2 4 5 6 8 9 11 21 ...
## $ vertices:'data.frame': 45 obs. of 2 variables:
## ..$ label : Factor w/ 45 levels "Abe Drexler",..: 5 9 16 23 26 32 33 38 39 17 ...
## ..$ Gender: Factor w/ 2 levels "female","male": 1 2 2 1 2 1 2 2 2 2 ...
# Merge edges and vertices
mmnet <- merge(madmen$edges, madmen$vertices,
by.x = "Name1", by.y="label",
all = TRUE)
# Examine structure of mmnet
str(mmnet)
## 'data.frame': 75 obs. of 3 variables:
## $ Name1 : Factor w/ 45 levels "Betty Draper",..: 1 1 2 2 2 2 2 2 2 2 ...
## $ Name2 : Factor w/ 39 levels "Abe Drexler",..: 15 31 2 4 5 6 8 9 11 21 ...
## $ Gender: Factor w/ 2 levels "female","male": 1 1 2 2 2 2 2 2 2 2 ...
# geomnet is pre-loaded
# Merge edges and vertices
mmnet <- merge(madmen$edges, madmen$vertices,
by.x = "Name1", by.y = "label",
all = TRUE)
# Finish the ggplot command
# ggplot(data = mmnet, aes(from_id = Name1, to_id = Name2)) +
# geom_net(aes(col=Gender), size=6, linewidth=1, label=TRUE, fontsize=3, labelcolour="black")
# Merge edges and vertices
mmnet <- merge(madmen$edges, madmen$vertices,
by.x = "Name1", by.y = "label",
all = TRUE)
# Tweak the network plot
# ggplot(data = mmnet, aes(from_id = Name1, to_id = Name2)) +
# geom_net(aes(col = Gender),
# size = 6,
# linewidth = 1,
# label = TRUE,
# fontsize = 3,
# labelcolour = "black",
# directed=TRUE) +
# scale_color_manual(values = c("#FF69B4", "#0099ff")) +
# xlim(c(-0.05, 1.05)) +
# ggmap::theme_nothing(legend=TRUE) +
# theme(legend.key=element_blank())
# Merge edges and vertices
mmnet <- merge(madmen$edges, madmen$vertices,
by.x = "Name1", by.y = "label",
all = TRUE)
# Fit a simple linear model of tree volume on girth: res
data(trees)
res <- lm(Volume ~ Girth, data = trees)
# Base-graphics diagnostics: residuals-vs-fitted, QQ, scale-location, leverage
par(mfrow = c(2, 2))
plot(res)
par(mfrow = c(1, 1)) # restore the default single-panel layout
# ggfortify's autoplot() renders the same diagnostic panels with ggplot2
library(ggfortify)
## Warning: package 'ggfortify' was built under R version 3.2.5
autoplot(res, ncol=2)
# Inspect structure of Canada
library(vars); data(Canada)
## Warning: package 'vars' was built under R version 3.2.5
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked _by_ '.GlobalEnv':
##
## mammals
## The following object is masked from 'package:dplyr':
##
## select
## Loading required package: strucchange
## Warning: package 'strucchange' was built under R version 3.2.4
## Loading required package: sandwich
## Loading required package: urca
## Warning: package 'urca' was built under R version 3.2.5
## Loading required package: lmtest
## Warning: package 'lmtest' was built under R version 3.2.5
str(Canada)
## mts [1:84, 1:4] 930 930 930 931 933 ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr [1:4] "e" "prod" "rw" "U"
## - attr(*, "tsp")= num [1:3] 1980 2001 4
## - attr(*, "class")= chr [1:2] "mts" "ts"
# Call plot() on Canada
plot(Canada)
# Call autoplot() on Canada
# autoplot(Canada)
# ggfortify and eurodist are available
# Autoplot + ggplot2 tweaking
# autoplot(eurodist) +
# labs(x="", y="") +
# coord_fixed() +
# theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
# Autoplot of MDS
# autoplot(cmdscale(eurodist, eig=TRUE), label=TRUE, label.size=3, size=0)
# perform clustering
iris_k <- kmeans(iris[-5], centers=3)
# autplot: coloring according to cluster
# autoplot(iris_k, data=iris, frame=TRUE)
# autoplot: coloring according to species
# autoplot(iris_k, data=iris, frame=TRUE, col="Species")
Choropleths are a series of polygons (or points or lines), and are useful when you have the shapes file and some underlying data. Maps (e.g., full US or by state or etc.) are a primary example - can put right in to ggplot2:
Cartographic maps are an alternative (topographical, photographic, etc.):
Animations can be useful for dense, temporal data or as an exploratory tool:
Example code includes:
library(ggplot2)
library(ggmap)
## Warning: package 'ggmap' was built under R version 3.2.5
library(ggthemes)
library(maps)
## Warning: package 'maps' was built under R version 3.2.5
##
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
##
## map
library(viridis)
# Basic map of the USA
usa <- ggplot2::map_data("usa")
str(usa)
## 'data.frame': 7243 obs. of 6 variables:
## $ long : num -101 -101 -101 -101 -101 ...
## $ lat : num 29.7 29.7 29.7 29.6 29.6 ...
## $ group : num 1 1 1 1 1 1 1 1 1 1 ...
## $ order : int 1 2 3 4 5 6 7 8 9 10 ...
## $ region : chr "main" "main" "main" "main" ...
## $ subregion: chr NA NA NA NA ...
ggplot(data=usa, aes(x=long, y=lat, group=group)) +
geom_polygon() +
coord_map()
# Add USA cities to the USA map (continental US only, cities with 250k+ population)
library(dplyr)
data(us.cities)
cities <- us.cities %>%
  mutate(City = name, State = country.etc, Pop_est = pop) %>%
  dplyr::select(City, State, Pop_est, lat, long) %>%
  # Drop Alaska/Hawaii and keep only large cities.
  # NOTE(review): "ma" is lowercase while the other entries are uppercase
  # state abbreviations, so this term likely never matches anything --
  # confirm whether "MA" was intended.
  filter(!(State %in% c("AK", "HI", "ma")) & Pop_est >= 250000)
ggplot(usa, aes(x = long, y = lat, group = group)) +
  geom_polygon() +
  # col is SET outside aes(): placing col = "red" inside aes() maps the
  # string as a data value (default salmon color + a spurious legend entry
  # labeled "red") instead of drawing literal red points.
  geom_point(data = cities, aes(group = State, size = Pop_est),
             col = "red", shape = 16, alpha = 0.6) +
  coord_map() +
  theme_map()
# Arrange cities
cities_arr <- arrange(cities, Pop_est)
# Create US plot of cities, colored by the viridis theme
ggplot(usa, aes(x = long, y = lat, group = group)) +
geom_polygon(fill="grey90") +
geom_point(data=cities_arr, aes(group=State, col=Pop_est), shape=16, size=2, alpha=0.6) +
coord_map() +
theme_map() +
scale_color_viridis()
# Create a dataset of populations by state
st_data <- "california ; texas ; florida ; new york ; illinois ; pennsylvania ; ohio ; georgia ; north carolina ; michigan ; new jersey ; virginia ; washington ; arizona ; massachusetts ; indiana ; tennessee ; missouri ; maryland ; wisconsin ; minnesota ; colorado ; south carolina ; alabama ; louisiana ; kentucky ; oregon ; oklahoma ; connecticut ; puerto rico ; iowa ; utah ; mississippi ; arkansas ; kansas ; nevada ; new mexico ; nebraska ; west virginia ; idaho ; hawaii ; new hampshire ; maine ; rhode island ; montana ; delaware ; south dakota ; north dakota ; alaska ; district of columbia ; vermont ; wyoming"
pop_data <- "39144818 ; 27469114 ; 20271272 ; 19795791 ; 12859995 ; 12802503 ; 11613423 ; 10214860 ; 10042802 ; 9922576 ; 8958013 ; 8382993 ; 7170351 ; 6828065 ; 6794422 ; 6619680 ; 6600299 ; 6083672 ; 6006401 ; 5771337 ; 5489594 ; 5456574 ; 4896146 ; 4858979 ; 4670724 ; 4425092 ; 4028977 ; 3911338 ; 3590886 ; 3474182 ; 3123899 ; 2995919 ; 2992333 ; 2978204 ; 2911641 ; 2890845 ; 2085109 ; 1896190 ; 1844128 ; 1654930 ; 1431603 ; 1330608 ; 1329328 ; 1056298 ; 1032949 ; 945934 ; 858469 ; 756927 ; 738432 ; 672228 ; 626042 ; 586107"
# Helper: split a " ; "-delimited string into a character vector.
split_fields <- function(s) strsplit(s, split = " ; ")[[1]]
# One row per region, with the matching population estimate.
pop <- data.frame(
  region = split_fields(st_data),
  Pop_est = as.numeric(split_fields(pop_data)),
  stringsAsFactors = FALSE
)
# Map the basic state data
state <- map_data("state")
ggplot(data=state, aes(x=long, y=lat, group=group, fill=region)) +
geom_polygon(col="white") +
coord_map()
# Map the states by population
state2 <- merge(state, pop, by="region")
# Fix: merge() does not preserve row order, while geom_polygon() connects
# vertices in row order — without re-sorting, state outlines are drawn as
# criss-crossing artifacts. map_data() supplies an `order` column for
# exactly this purpose.
state2 <- state2[order(state2$order), ]
ggplot(data=state2, aes(x=long, y=lat, group=group, fill=Pop_est)) +
geom_polygon(col="white") +
coord_map() +
theme_map()
# Import shape information: germany (commented out since files not available)
library(rgdal)
## Warning: package 'rgdal' was built under R version 3.2.5
## Loading required package: sp
## Warning: package 'sp' was built under R version 3.2.5
## rgdal: version: 1.2-3, (SVN revision 639)
## Geospatial Data Abstraction Library extensions to R successfully loaded
## Loaded GDAL runtime: GDAL 2.0.1, released 2015/09/15
## Path to GDAL shared files: C:/Users/Dave/Documents/R/win-library/3.2/rgdal/gdal
## GDAL does not use iconv for recoding strings.
## Loaded PROJ.4 runtime: Rel. 4.9.1, 04 March 2015, [PJ_VERSION: 491]
## Path to PROJ.4 shared files: C:/Users/Dave/Documents/R/win-library/3.2/rgdal/proj
## Linking to sp version: 1.2-3
# germany <- readOGR(dsn="shapes", "DEU_adm1")
# bundes <- fortify(germany)
#
# ggplot(data=bundes, aes(x=long, y=lat, group=group)) +
# geom_polygon(fill="blue", col="white") +
# coord_map() +
# theme_nothing()
#
# bundes$state <- factor(as.numeric(bundes$id))
# levels(bundes$state) <- germany$NAME_1
#
# bundes_unemp <- merge(bundes, unemp, by="state")
#
# ggplot(bundes_unemp, aes(x = long, y = lat, group = group, fill=unemployment)) +
# geom_polygon() +
# coord_map() +
# theme_map()
# Create the map of London
library(ggmap)
# get_map() downloads a raster basemap (Google terrain by default at this
# time); ggmap() then draws it as a ggplot background layer.
london_map_13 <- get_map("London, England", zoom=13)
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=London,+England&zoom=13&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=London,%20England&sensor=false
ggmap(london_map_13)
# Experiment with get_map() and use ggmap() to plot it!
# temp1 <- get_map("London, England", zoom=13, maptype="toner", source="stamen")
# ggmap(temp1)
# Same location, Google "hybrid" (satellite + labels) tiles.
temp2 <- get_map("London, England", zoom=13, maptype="hybrid")
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=London,+England&zoom=13&size=640x640&scale=2&maptype=hybrid&language=en-EN&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=London,%20England&sensor=false
ggmap(temp2)
# Map some key sites in London
# Build a character vector of four full addresses from one delimited string.
london_sites <- strsplit("Tower of London, London ; Buckingham Palace, London ; Tower Bridge, London ; Queen Elizabeth Olympic Park, London", " ; ")[[1]]
# geocode() queries the Google geocoding API and returns a data frame of
# lon/lat coordinates, one row per address.
xx <- geocode(london_sites)
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Tower%20of%20London,%20London&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Buckingham%20Palace,%20London&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Tower%20Bridge,%20London&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Queen%20Elizabeth%20Olympic%20Park,%20London&sensor=false
# Short display labels: strip the ", London" suffix from each address.
xx$location <- sub(", London", "", london_sites)
# Black-and-white Stamen "toner" basemap for the same area.
london_ton_13 <- get_map(location = "London, England", zoom = 13,
source = "stamen", maptype = "toner")
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=London,+England&zoom=13&size=640x640&scale=2&maptype=terrain&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=London,%20England&sensor=false
## Map from URL : http://tile.stamen.com/toner/13/4091/2722.png
## Map from URL : http://tile.stamen.com/toner/13/4092/2722.png
## Map from URL : http://tile.stamen.com/toner/13/4093/2722.png
## Map from URL : http://tile.stamen.com/toner/13/4094/2722.png
## Map from URL : http://tile.stamen.com/toner/13/4091/2723.png
## Map from URL : http://tile.stamen.com/toner/13/4092/2723.png
## Map from URL : http://tile.stamen.com/toner/13/4093/2723.png
## Map from URL : http://tile.stamen.com/toner/13/4094/2723.png
## Map from URL : http://tile.stamen.com/toner/13/4091/2724.png
## Map from URL : http://tile.stamen.com/toner/13/4092/2724.png
## Map from URL : http://tile.stamen.com/toner/13/4093/2724.png
## Map from URL : http://tile.stamen.com/toner/13/4094/2724.png
## Map from URL : http://tile.stamen.com/toner/13/4091/2725.png
## Map from URL : http://tile.stamen.com/toner/13/4092/2725.png
## Map from URL : http://tile.stamen.com/toner/13/4093/2725.png
## Map from URL : http://tile.stamen.com/toner/13/4094/2725.png
# Add a geom_points layer
# Overlay the geocoded sites on the toner basemap, coloured by site name.
# The warning below is because one site falls outside this map's extent.
ggmap(london_ton_13) +
geom_point(data=xx, aes(col=location), size=6)
## Warning: Removed 1 rows containing missing values (geom_point).
# Expand to use the bounding box
# Re-geocode the four sites (same query as above) to start the bbox version.
xx <- geocode(london_sites)
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Tower%20of%20London,%20London&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Buckingham%20Palace,%20London&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Tower%20Bridge,%20London&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Queen%20Elizabeth%20Olympic%20Park,%20London&sensor=false
# Short labels again; wrap the longest one over two lines for plotting.
xx$location <- sub(", London", "", london_sites)
xx$location[4] <- "Queen Elizabeth\nOlympic Park"
# Create bounding box: bbox
# f = 0.3 pads the box by 30% beyond the extreme coordinates.
bbox <- make_bbox(lon = xx$lon, lat = xx$lat, f = 0.3)
# Fetch a toner basemap sized to the bounding box so all four sites fit.
london_ton_13 <- get_map(bbox, zoom = 13,
source = "stamen", maptype = "toner"
)
## Map from URL : http://tile.stamen.com/toner/13/4091/2722.png
## Warning in file.remove(index[[url]]): cannot remove file
## 'a991dd39e80eba942f916d1a39eacba1.rds', reason 'No such file or directory'
## Map from URL : http://tile.stamen.com/toner/13/4092/2722.png
## Warning in file.remove(index[[url]]): cannot remove file
## 'ca4d532407f75fb40f356d82e9f3e868.rds', reason 'No such file or directory'
## Map from URL : http://tile.stamen.com/toner/13/4093/2722.png
## Warning in file.remove(index[[url]]): cannot remove file
## '2d23913e51259d39586aa2f72cf7262a.rds', reason 'No such file or directory'
## Map from URL : http://tile.stamen.com/toner/13/4094/2722.png
## Warning in file.remove(index[[url]]): cannot remove file
## 'd95217086725a265fa36c032fbc90ad6.rds', reason 'No such file or directory'
## Map from URL : http://tile.stamen.com/toner/13/4095/2722.png
## Map from URL : http://tile.stamen.com/toner/13/4096/2722.png
## Map from URL : http://tile.stamen.com/toner/13/4091/2723.png
## Warning in file.remove(index[[url]]): cannot remove file
## '34870d377d252bc5a4f1705aaec727a7.rds', reason 'No such file or directory'
## Map from URL : http://tile.stamen.com/toner/13/4092/2723.png
## Warning in file.remove(index[[url]]): cannot remove file
## 'afecef47c6d1c2bb613ae817f73ef94e.rds', reason 'No such file or directory'
## Map from URL : http://tile.stamen.com/toner/13/4093/2723.png
## Warning in file.remove(index[[url]]): cannot remove file
## 'f0de52ccfe7b807ab9e08f76c37ff7be.rds', reason 'No such file or directory'
## Map from URL : http://tile.stamen.com/toner/13/4094/2723.png
## Warning in file.remove(index[[url]]): cannot remove file
## 'f7fedbdcd7d7ca7fa3714dc4a4b6b802.rds', reason 'No such file or directory'
## Map from URL : http://tile.stamen.com/toner/13/4095/2723.png
## Map from URL : http://tile.stamen.com/toner/13/4096/2723.png
## Map from URL : http://tile.stamen.com/toner/13/4091/2724.png
## Warning in file.remove(index[[url]]): cannot remove file
## '8bebd0980b74870f3d7dc958d655198f.rds', reason 'No such file or directory'
## Map from URL : http://tile.stamen.com/toner/13/4092/2724.png
## Warning in file.remove(index[[url]]): cannot remove file
## '245dcf359e94685490a556539f0ef26f.rds', reason 'No such file or directory'
## Map from URL : http://tile.stamen.com/toner/13/4093/2724.png
## Warning in file.remove(index[[url]]): cannot remove file
## '92a8bede90fdcd1ba21c8c78c10f06b3.rds', reason 'No such file or directory'
## Map from URL : http://tile.stamen.com/toner/13/4094/2724.png
## Warning in file.remove(index[[url]]): cannot remove file
## '9acc9d919db20a0d68fdef3dd4092cfe.rds', reason 'No such file or directory'
## Map from URL : http://tile.stamen.com/toner/13/4095/2724.png
## Map from URL : http://tile.stamen.com/toner/13/4096/2724.png
# Map from previous exercise: coloured points, one per site.
ggmap(london_ton_13) +
geom_point(data = xx, aes(col = location), size = 6)
# New map with labels: text boxes at each site instead of points.
ggmap(london_ton_13) +
geom_label(data = xx,
aes(label = location),
col = "#E41A1C", fill = "grey90",
fontface = "bold", size = 4)
# Get the map data of "Germany" and Plot map and polygon on top: (not displayed)
# germany_06 <- get_map("Germany", zoom=6)
# ggmap(germany_06) +
# geom_polygon(data=bundes, aes(x=long, y=lat, group=group), fill=NA, col="red") +
# coord_map()
# Animated Japan map (not shown)
# str(japan)
#
# saveGIF({
#
# for (i in unique(japan$time)) {
#
# data <- japan[japan$time == i, ]
#
# p <- ggplot(data, aes(x = AGE, y = POP, fill = SEX, width = 1)) +
# coord_flip() +
# geom_bar(data = data[data$SEX == "Female",], stat = "identity") +
# geom_bar(data = data[data$SEX == "Male",], stat = "identity") +
# ggtitle(i)
#
# print(p)
#
# }
#
# }, movie.name = "pyramid.gif", interval = 0.1)
# Animate the vocabularies by year
# library(gganimate)
# library(car)
# data(Vocab)
# p <- ggplot(Vocab, aes(x = education, y = vocabulary,
# color = year, group = year,
# frame=year, cumulative=TRUE
# )
# ) +
# stat_smooth(method = "lm", se = FALSE, size = 3)
#
# gg_animate(p, filename="vocab.gif", interval = 1.0)
R has 2 plotting systems (base and grid); the ggplot2 internals are built on the grid system:
There are many possible customizations to the grid process:
The ggplot2 package is built using the grid framework:
Any time you create a ggplot (even if empty - ggplot()), you create an object:
The gridExtra library allows for additional customization:
You can write your own extensions in ggplot2 2.0 (e.g., the “bag plot” based on Tukey):
Case Study - weather, using NYC data and several Tukey-like graphs:
Case Study Part II - same idea, but increasingly efficient by way of several stat functions in ggplot2:
Example code includes:
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
##
## Attaching package: 'survival'
## The following object is masked from 'package:robustbase':
##
## heart
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following object is masked from 'package:GSIF':
##
## describe
## The following objects are masked from 'package:dplyr':
##
## combine, src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, round.POSIXt, trunc.POSIXt, units
library(grid)
library(gtable)
## Warning: package 'gtable' was built under R version 3.2.5
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following objects are masked from 'package:ggtern':
##
## arrangeGrob, grid.arrange
library(dplyr)
detach(package:dplyr)
library(plyr)
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:Hmisc':
##
## is.discrete, summarize
## The following object is masked from 'package:maps':
##
## ozone
## The following object is masked from 'package:purrr':
##
## compact
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:Hmisc':
##
## combine, src, summarize
## The following object is masked from 'package:MASS':
##
## select
## The following object is masked from 'package:GGally':
##
## nasa
## The following objects are masked from 'package:xts':
##
## first, last
## The following objects are masked from 'package:data.table':
##
## between, last
## The following object is masked from 'package:purrr':
##
## order_by
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(aplpack)
## Warning: package 'aplpack' was built under R version 3.2.5
## Loading required package: tcltk
# Draw rectangle in null viewport
# grid.* calls draw immediately into the current (here: root/"null") viewport.
grid.rect(gp=gpar(fill = "grey90"))
# Write text in null viewport
grid.text("null viewport")
# Draw a line
# Coordinates are in npc units (0-1 across the viewport).
grid.lines(x=c(0, 0.75), y=c(0.25, 1), gp=gpar(lty=2, col="red"))
# Populate null viewport
# The next three calls repeat the drawing above as a single combined exercise.
grid.rect(gp = gpar(fill = "grey90"))
grid.text("null viewport")
grid.lines(x = c(0,0.75), y = c(0.25, 1),
gp = gpar(lty = 2, col = "red"))
# Create new viewport: vp
# A viewport is a rectangular drawing region; this one is half the size of
# its parent, centered.
vp <- viewport(x=0.5, y=0.5, width=0.5, height=0.5, just="center")
# Push vp
# pushViewport() makes vp the current drawing region (viewports nest).
pushViewport(vp)
# Populate new viewport with rectangle
grid.rect(gp=gpar(fill="blue"))
# Create plot viewport: pvp
# plotViewport() leaves margins (in lines) around a central plot region,
# like par(mar = ...) in base graphics.
mar <- c(5, 4, 2, 2)
pvp <- plotViewport(mar)
# Push pvp
pushViewport(pvp)
# Add rectangle
grid.rect(gp=gpar(fill="grey80"))
# Create data viewport: dvp
# dataViewport() sets up native coordinates from the data ranges, so later
# drawing can use data units (wt on x, mpg on y).
dvp <- dataViewport(mtcars$wt, mtcars$mpg)
# Push dvp
pushViewport(dvp)
# Add two axes
grid.xaxis()
grid.yaxis()
# Work from before
# Rebuild the margin + data viewport stack from the previous exercise.
pushViewport(plotViewport(c(5, 4, 2, 2)))
grid.rect(gp = gpar())
pushViewport(dataViewport(xData = mtcars$wt, yData = mtcars$mpg))
grid.xaxis()
grid.yaxis()
# Add text to x axis
# Negative "lines" units place the label in the margin below the plot region.
grid.text(label="Weight", y=unit(-3, "lines"))
# Add text to y axis
grid.text(label="MPG", x=unit(-3, "lines"), rot=90)
# Add points
# Drawn in native (data) coordinates thanks to the pushed dataViewport.
grid.points(x=mtcars$wt, y=mtcars$mpg, pch=16)
# Work from before
pushViewport(plotViewport(c(5, 4, 2, 2)))
grid.rect(gp = gpar())
pushViewport(dataViewport(xData = mtcars$wt, yData = mtcars$mpg))
grid.xaxis()
grid.yaxis()
# Work from before - add names
# Named grobs can be modified in place later via grid.edit().
grid.text("Weight", y = unit(-3, "lines"), name = "xaxis")
grid.text("MPG", x = unit(-3, "lines"), rot = 90, name = "yaxis")
grid.points(x = mtcars$wt, y = mtcars$mpg, pch = 16, name = "datapoints")
# Edit "xaxis" — x carries mtcars$wt, so it gets the weight label.
# Fix: the two replacement labels were swapped in the original
# (the x axis was relabelled "Miles/(US) gallon" and vice versa).
grid.edit("xaxis", label="Weight (1000 lbs)")
# Edit "yaxis" — y carries mtcars$mpg.
grid.edit("yaxis", label="Miles/(US) gallon")
# Edit "datapoints"
# 8-digit hex colour: last two digits are alpha (translucent red).
grid.edit("datapoints", gp=gpar(col="#C3212766", cex=2))
# A simple plot p
p <- ggplot(mtcars, aes(x = wt, y = mpg, col = factor(cyl))) + geom_point()
# Create gtab with ggplotGrob()
# ggplotGrob() converts a ggplot object into a gtable of grid grobs.
gtab <- ggplotGrob(p)
# Print out gtab
# Printing shows the table layout: one row per grob with its cell position.
gtab
## TableGrob (10 x 9) "layout": 18 grobs
## z cells name grob
## 1 0 ( 1-10, 1- 9) background rect[plot.background..rect.16221]
## 2 5 ( 5- 5, 3- 3) spacer zeroGrob[NULL]
## 3 7 ( 6- 6, 3- 3) axis-l absoluteGrob[GRID.absoluteGrob.16197]
## 4 3 ( 7- 7, 3- 3) spacer zeroGrob[NULL]
## 5 6 ( 5- 5, 4- 4) axis-t zeroGrob[NULL]
## 6 1 ( 6- 6, 4- 4) panel gTree[panel-1.gTree.16177]
## 7 9 ( 7- 7, 4- 4) axis-b absoluteGrob[GRID.absoluteGrob.16190]
## 8 4 ( 5- 5, 5- 5) spacer zeroGrob[NULL]
## 9 8 ( 6- 6, 5- 5) axis-r zeroGrob[NULL]
## 10 2 ( 7- 7, 5- 5) spacer zeroGrob[NULL]
## 11 10 ( 4- 4, 4- 4) xlab-t zeroGrob[NULL]
## 12 11 ( 8- 8, 4- 4) xlab-b titleGrob[axis.title.x..titleGrob.16180]
## 13 12 ( 6- 6, 2- 2) ylab-l titleGrob[axis.title.y..titleGrob.16183]
## 14 13 ( 6- 6, 6- 6) ylab-r zeroGrob[NULL]
## 15 14 ( 6- 6, 8- 8) guide-box gtable[guide-box]
## 16 15 ( 3- 3, 4- 4) subtitle zeroGrob[plot.subtitle..zeroGrob.16218]
## 17 16 ( 2- 2, 4- 4) title zeroGrob[plot.title..zeroGrob.16217]
## 18 17 ( 9- 9, 4- 4) caption zeroGrob[plot.caption..zeroGrob.16219]
# Extract the grobs from gtab: gtab
g <- gtab$grobs
# Draw only the legend
# Index 15 is the "guide-box" grob per the layout printed above.
# NOTE(review): this position is version-dependent in ggplot2 — looking the
# grob up by name would be more robust.
grid.draw(g[[15]])
# Code from before
p <- ggplot(mtcars, aes(x = wt, y = mpg, col = factor(cyl))) + geom_point()
gtab <- ggplotGrob(p)
g <- gtab$grobs
grid.draw(g[[15]])
# Show layout of g[[15]]
# Displays the legend's own internal gtable cells (rows/columns).
gtable_show_layout(g[[15]])
# Create text grob
my_text <- textGrob(label = "Motor Trend, 1974", gp = gpar(fontsize = 7, col = "gray25"))
# Use gtable_add_grob to modify original gtab
# Place my_text into row 3, column 2 of the legend's gtable.
new_legend <- gtable_add_grob(gtab$grobs[[15]], my_text, 3, 2)
# Update in gtab
gtab$grobs[[15]] <- new_legend
# Draw gtab
# Render the full plot with the annotated legend.
grid.draw(gtab)
# Simple plot p
p <- ggplot(mtcars, aes(x = wt, y = mpg, col = factor(cyl))) + geom_point()
# Examine class() and names()
# A ggplot object is just a named list with classes "gg"/"ggplot".
class(p)
## [1] "gg" "ggplot"
names(p)
## [1] "data" "layers" "scales" "mapping" "theme"
## [6] "coordinates" "facet" "plot_env" "labels"
# Print the scales sub-list
# Empty until scales are explicitly added; defaults are filled in at build time.
p$scales$scales
## list()
# Update p
# NOTE(review): "Length"/"Width" axis titles don't match wt/mpg — these look
# carried over from an iris-based exercise; kept as-is from the course.
p <- p +
scale_x_continuous("Length", limits = c(4, 8), expand = c(0, 0)) +
scale_y_continuous("Width", limits = c(2, 4.5), expand = c(0, 0))
# Print the scales sub-list
p$scales$scales
## [[1]]
## <ScaleContinuousPosition>
## Range:
## Limits: 4 -- 8
##
## [[2]]
## <ScaleContinuousPosition>
## Range:
## Limits: 2 -- 4.5
# Box plot of mtcars: p
p <- ggplot(mtcars, aes(x = factor(cyl), y = wt)) + geom_boxplot()
# Create pbuild
# ggplot_build() performs the data transformations (stats, scales) and
# returns the computed values before rendering.
pbuild <- ggplot_build(p)
# a list of 3 elements
names(pbuild)
## [1] "data" "layout" "plot"
# Print out each element in pbuild
# pbuild$data holds one data frame per layer with the computed aesthetics.
pbuild$data
## [[1]]
## ymin lower middle upper ymax outliers notchupper
## 1 1.513 1.8850 2.200 2.62250 3.19 2.551336
## 2 2.620 2.8225 3.215 3.44000 3.46 3.583761
## 3 3.170 3.5325 3.755 4.01375 4.07 5.250, 5.424, 5.345 3.958219
## notchlower x PANEL group ymin_final ymax_final xmin xmax weight colour
## 1 1.848664 1 1 1 1.513 3.190 0.625 1.375 1 grey20
## 2 2.846239 2 1 2 2.620 3.460 1.625 2.375 1 grey20
## 3 3.551781 3 1 3 3.170 5.424 2.625 3.375 1 grey20
## fill size alpha shape linetype
## 1 white 0.5 NA 19 solid
## 2 white 0.5 NA 19 solid
## 3 white 0.5 NA 19 solid
pbuild$layout
## <ggproto object: Class Layout>
## facet: <ggproto object: Class FacetNull, Facet>
## compute_layout: function
## draw_back: function
## draw_front: function
## draw_labels: function
## draw_panels: function
## finish_data: function
## init_scales: function
## map: function
## map_data: function
## params: list
## render_back: function
## render_front: function
## render_panels: function
## setup_data: function
## setup_params: function
## shrink: TRUE
## train: function
## train_positions: function
## train_scales: function
## vars: function
## super: <ggproto object: Class FacetNull, Facet>
## finish_data: function
## get_scales: function
## map: function
## map_position: function
## panel_layout: data.frame
## panel_ranges: list
## panel_scales: list
## render: function
## render_labels: function
## reset_scales: function
## setup: function
## train_position: function
## train_ranges: function
## xlabel: function
## ylabel: function
## super: <ggproto object: Class Layout>
# The original ggplot object is carried along inside the build.
pbuild$plot
# Create gtab from pbuild
# ggplot_gtable() turns the built plot into a drawable gtable of grobs.
gtab <- ggplot_gtable(pbuild)
# Draw gtab
grid.draw(gtab)
# Box plot of mtcars: p
p <- ggplot(mtcars, aes(x = factor(cyl), y = wt)) + geom_boxplot()
# Build pdata
# Grab only the computed layer data (list of data frames, one per layer).
pdata <- ggplot_build(p)$data
# Access the first element of the list, a data frame
class(pdata[[1]])
## [1] "data.frame"
# Isolate this data frame
my_df <- pdata[[1]]
# The x labels
# Replace the integer group ids with the cylinder labels they represent.
my_df$group <- c("4", "6", "8")
# Print out specific variables
# Box stats (ymin/lower/middle/upper/ymax/outliers) plus the group label.
my_df[c(1:6, 11)]
## ymin lower middle upper ymax outliers group
## 1 1.513 1.8850 2.200 2.62250 3.19 4
## 2 2.620 2.8225 3.215 3.44000 3.46 6
## 3 3.170 3.5325 3.755 4.01375 4.07 5.250, 5.424, 5.345 8
# Add a theme (legend at the bottom)
# Scatter of weight vs mpg, coloured by cylinder count.
g1 <- ggplot(mtcars, aes(wt, mpg, col = cyl)) +
geom_point(alpha = 0.5) +
theme(legend.position = "bottom")
# Add a theme (no legend)
# Overlapping displacement histograms, one fill per cylinder count.
g2 <- ggplot(mtcars, aes(disp, fill = cyl)) +
geom_histogram(binwidth = 20, alpha = 0.5, position = "identity") +
theme(legend.position = "none")
# Load gridExtra
library(gridExtra)
# Call grid.arrange()
# Place the two plots side by side.
grid.arrange(g1, g2, ncol = 2)
# Definitions of g1 and g2
g1 <- ggplot(mtcars, aes(wt, mpg, col = cyl)) +
geom_point() +
theme(legend.position = "bottom")
g2 <- ggplot(mtcars, aes(disp, fill = cyl)) +
geom_histogram(binwidth = 20) +
theme(legend.position = "none")
# Extract the legend from g1.
# Fix: look the legend grob up by its name ("guide-box") instead of the
# hard-coded position 15, which shifts between ggplot2 versions.
g1_grobs <- ggplotGrob(g1)$grobs
grob_names <- vapply(g1_grobs, function(gr) gr$name, character(1))
my_legend <- g1_grobs[[which(grob_names == "guide-box")[1]]]
# Create g1_noleg
g1_noleg <- ggplot(mtcars, aes(wt, mpg, col = cyl)) +
geom_point() +
theme(legend.position = "none")
# Calculate the height: legend_height
legend_height <- sum(my_legend$heights)
# Arrange g1_noleg, g2 and my_legend
# Layout: the two plots on top, the shared legend spanning the bottom row.
grid.arrange(g1_noleg, g2, my_legend,
layout_matrix=matrix(c(1, 3, 2, 3), ncol=2),
heights=unit.c(unit(1, "npc") - legend_height, legend_height)
)
Continuing with:
# Build the 60-point data set used for the bag plot examples.
# Improvement: construct the data frame directly from the parsed vectors
# instead of allocating a zero-filled frame and overwriting each column.
parse_nums <- function(s) as.numeric(strsplit(s, " ; ")[[1]])
test_data <- data.frame(
  x = parse_nums("2560 ; 2345 ; 1845 ; 2260 ; 2440 ; 2285 ; 2275 ; 2350 ; 2295 ; 1900 ; 2390 ; 2075 ; 2330 ; 3320 ; 2885 ; 3310 ; 2695 ; 2170 ; 2710 ; 2775 ; 2840 ; 2485 ; 2670 ; 2640 ; 2655 ; 3065 ; 2750 ; 2920 ; 2780 ; 2745 ; 3110 ; 2920 ; 2645 ; 2575 ; 2935 ; 2920 ; 2985 ; 3265 ; 2880 ; 2975 ; 3450 ; 3145 ; 3190 ; 3610 ; 2885 ; 3480 ; 3200 ; 2765 ; 3220 ; 3480 ; 3325 ; 3855 ; 3850 ; 3195 ; 3735 ; 3665 ; 3735 ; 3415 ; 3185 ; 3690"),
  y = parse_nums("97 ; 114 ; 81 ; 91 ; 113 ; 97 ; 97 ; 98 ; 109 ; 73 ; 97 ; 89 ; 109 ; 305 ; 153 ; 302 ; 133 ; 97 ; 125 ; 146 ; 107 ; 109 ; 121 ; 151 ; 133 ; 181 ; 141 ; 132 ; 133 ; 122 ; 181 ; 146 ; 151 ; 116 ; 135 ; 122 ; 141 ; 163 ; 151 ; 153 ; 202 ; 180 ; 182 ; 232 ; 143 ; 180 ; 180 ; 151 ; 189 ; 180 ; 231 ; 305 ; 302 ; 151 ; 202 ; 182 ; 181 ; 143 ; 146 ; 146")
)
# Call bagplot() on test_data
# aplpack's bag plot: a bivariate analogue of the box plot (Tukey).
bagplot(test_data)
# Call compute.bagplot on test_data: bag
# Computes the components without plotting: hull.loop (outer fence),
# hull.bag (central 50%) and pxy.outlier (points beyond the loop).
bag <- compute.bagplot(test_data)
# Highlight components
points(bag$hull.loop, col = "green", pch = 16)
points(bag$hull.bag, col = "orange", pch = 16)
points(bag$pxy.outlier, col="purple", pch=16)
# Create data frames from matrices
# ggplot needs data frames, so convert each 2-column coordinate matrix.
hull.loop <- data.frame(x = bag$hull.loop[,1], y = bag$hull.loop[,2])
hull.bag <- data.frame(x = bag$hull.bag[,1], y = bag$hull.bag[,2])
pxy.outlier <- data.frame(x = bag$pxy.outlier[, 1], y=bag$pxy.outlier[, 2])
# Finish the ggplot command
# Re-create the bag plot with ggplot2 layers: loop, bag, then outliers.
ggplot(test_data, aes(x=x, y=y)) +
geom_polygon(data = hull.loop, fill = "green") +
geom_polygon(data = hull.bag, fill = "orange") +
geom_point(data=pxy.outlier, col="purple", pch=16, cex=1.5)
# ggproto for StatLoop (hull.loop)
# Stat that reduces a group's points to the vertices of the bag plot's outer
# hull ("loop"), ready to be drawn by geom_polygon.
StatLoop <- ggproto("StatLoop", Stat,
required_aes = c("x", "y"),
compute_group = function(data, scales) {
  hull <- compute.bagplot(x = data$x, y = data$y)$hull.loop
  data.frame(x = hull[, 1], y = hull[, 2])
})
# ggproto for StatBag (hull.bag)
# Stat that reduces a group's points to the vertices of the central "bag"
# hull (roughly the innermost 50% of the data).
StatBag <- ggproto("StatBag", Stat,
required_aes = c("x", "y"),
compute_group = function(data, scales) {
  hull <- compute.bagplot(x = data$x, y = data$y)$hull.bag
  data.frame(x = hull[, 1], y = hull[, 2])
})
# ggproto for StatOut (pxy.outlier)
# Stat that keeps only the points falling outside the loop (the outliers),
# to be drawn with geom_point.
StatOut <- ggproto("StatOut", Stat,
required_aes = c("x", "y"),
compute_group = function(data, scales) {
  outliers <- compute.bagplot(x = data$x, y = data$y)$pxy.outlier
  data.frame(x = outliers[, 1], y = outliers[, 2])
})
# Combine ggproto objects in layers to build stat_bag()
# Returns a list of three layers (loop polygon, bag polygon, outlier points)
# that together draw a complete bag plot.
# NOTE(review): `loop` is accepted but never used in the body — confirm
# whether it was meant to toggle the hull.loop layer.
stat_bag <- function(mapping = NULL, data = NULL, geom = "polygon",
                     position = "identity", na.rm = FALSE, show.legend = NA,
                     inherit.aes = TRUE, loop = FALSE, ...) {
  # Local constructor so the three layers share their common arguments.
  bag_layer <- function(stat, geom, params) {
    layer(
      stat = stat, data = data, mapping = mapping, geom = geom,
      position = position, show.legend = show.legend, inherit.aes = inherit.aes,
      params = params
    )
  }
  list(
    # StatLoop layer
    bag_layer(StatLoop, geom, list(na.rm = na.rm, alpha = 0.35, col = NA, ...)),
    # StatBag layer
    bag_layer(StatBag, geom, list(na.rm = na.rm, alpha = 0.35, col = NA, ...)),
    # StatOut layer (always points, regardless of `geom`)
    bag_layer(StatOut, "point", list(na.rm = na.rm, alpha = 0.7, col = NA, shape = 21, ...))
  )
}
# Previous method
# Manual layering using the pre-computed hull/outlier data frames.
ggplot(test_data, aes(x = x, y = y)) +
geom_polygon(data = hull.loop, fill = "green") +
geom_polygon(data = hull.bag, fill = "orange") +
geom_point(data = pxy.outlier, col = "purple", pch = 16, cex = 1.5)
# stat_bag
# The custom stat computes everything itself from the raw data.
ggplot(test_data, aes(x = x, y = y)) +
stat_bag(fill = "black")
## Warning: Removed 4 rows containing missing values (geom_point).
# stat_bag on test_data2
# Would require bringing over test_data2 (point is that it is now transferable)
# ggplot(test_data2, aes(x = x, y = y, fill=treatment)) +
# stat_bag()
# Data set NYNEWYOR.txt downloaded from US Cities/New York/NYC available at:
# http://academic.udayton.edu/kissock/http/Weather/
# Import weather data
# Fixed-width file: month, day, year, then daily average temperature (F).
weather <- read.fwf("NYNEWYOR.txt",
header = FALSE,
col.names = c("month", "day", "year", "temp"),
widths = c(14, 14, 13, 4)
)
# Check structure of weather
str(weather)
## 'data.frame': 8009 obs. of 4 variables:
## $ month: num 1 1 1 1 1 1 1 1 1 1 ...
## $ day : num 1 2 3 4 5 6 7 8 9 10 ...
## $ year : num 1995 1995 1995 1995 1995 ...
## $ temp : num 44 41 28 31 21 27 42 35 34 29 ...
# Create past with two filter() calls
# Drop leap days (Feb 29) so every year has 365 rows, then drop the
# current (max) year, which is still incomplete.
past <- weather %>%
filter(month != 2 | day != 29) %>%
filter(year != max(weather$year))
# Check structure of past
str(past)
## 'data.frame': 7665 obs. of 4 variables:
## $ month: num 1 1 1 1 1 1 1 1 1 1 ...
## $ day : num 1 2 3 4 5 6 7 8 9 10 ...
## $ year : num 1995 1995 1995 1995 1995 ...
## $ temp : num 44 41 28 31 21 27 42 35 34 29 ...
# Create new version of past
# Number the days within each year (1..365), drop -99 sentinel values
# (missing temperatures), then compute per-day-of-year summary columns.
past_summ <- past %>%
group_by(year) %>%
mutate(yearday=1:length(day)) %>%
ungroup() %>%
filter(temp != -99) %>%
group_by(yearday) %>%
# Hmisc::smean.cl.normal() returns c(Mean, Lower, Upper), so [2] is the
# lower and [3] the upper bound of the normal-theory 95% CI.
mutate(max = max(temp),
min = min(temp),
avg = mean(temp),
CI_lower = Hmisc::smean.cl.normal(temp)[2],
CI_upper = Hmisc::smean.cl.normal(temp)[3]
) %>%
ungroup()
# Structure of past_summ
str(past_summ)
## Classes 'tbl_df', 'tbl' and 'data.frame': 7645 obs. of 10 variables:
## $ month : num 1 1 1 1 1 1 1 1 1 1 ...
## $ day : num 1 2 3 4 5 6 7 8 9 10 ...
## $ year : num 1995 1995 1995 1995 1995 ...
## $ temp : num 44 41 28 31 21 27 42 35 34 29 ...
## $ yearday : int 1 2 3 4 5 6 7 8 9 10 ...
## $ max : num 51 48 57 55 56 62 52 57 54 47 ...
## $ min : num 17 15 16 15 21 14 14 12 21 8.5 ...
## $ avg : num 35.6 35.4 34.9 35.1 35.9 ...
## $ CI_lower: num 31 31.6 29.7 29.9 31.9 ...
## $ CI_upper: num 40.1 39.2 40 40.4 39.9 ...
# Adapt historical plot: past daily temperatures as faint points, with each
# day-of-year's 95% CI drawn as a vertical range underneath.
ggplot(past_summ, aes(x = yearday, y = temp)) +
geom_point(shape = 16, alpha = 0.3, col = "#EED8AE") +
geom_linerange(aes(ymin = CI_lower, ymax = CI_upper), col = "#8B7E66")
# Create present
# The current (max) year only: drop leap days, number the days within the
# year, and remove -99 missing-value sentinels.
present <- weather %>%
filter(month != 2 | day != 29) %>%
filter(year == max(year)) %>%
group_by(year) %>%
mutate(yearday = seq_along(day)) %>%
ungroup() %>%
filter(temp != -99)
# Add geom_line to ggplot command
# Overlay the current year as a line on the historical background.
ggplot(past_summ, aes(x = yearday, y = temp)) +
geom_point(shape = 16, alpha = 0.3, col = "#EED8AE") +
geom_linerange(aes(ymin = CI_lower, ymax = CI_upper), col = "#8B7E66") +
geom_line(data = present, aes(x = yearday, y = temp))
# Create past_highs: historical maximum temperature for each day of the year.
past_highs <- past_summ %>%
group_by(yearday) %>%
summarise(past_high = max(temp))
# Create record_high: current-year days that beat the historical maximum.
record_high <- present %>%
left_join(past_highs, by = "yearday") %>%
filter(temp > past_high)
# Add record_high information to plot
# Record-setting days are marked as red points on top of everything else.
ggplot(past_summ, aes(x = yearday, y = temp)) +
geom_point(shape = 16, alpha = 0.3, col = "#EED8AE") +
geom_linerange(aes(ymin = CI_lower, ymax = CI_upper), col = "#8B7E66") +
geom_line(data = present) +
geom_point(data = record_high, col = "#CD2626")
# Create past_extremes
# Historical minimum and maximum for each day of the year.
past_extremes <- past_summ %>%
group_by(yearday) %>%
summarise(past_low = min(temp),
past_high = max(temp)
)
# Create record_high_low
# Tag each current-year day with a colour: blue for a record low, red for a
# record high, fully transparent ("#00000000") otherwise.
record_high_low <- present %>%
left_join(past_extremes, by="yearday") %>%
mutate(record = ifelse(temp < past_low,
"#0000CD",
ifelse(temp > past_high,
"#CD2626",
"#00000000")
)
)
# Structure of record_high_low
str(record_high_low)
## Classes 'tbl_df', 'tbl' and 'data.frame': 338 obs. of 8 variables:
## $ month : num 1 1 1 1 1 1 1 1 1 1 ...
## $ day : num 1 2 3 4 5 6 7 8 9 10 ...
## $ year : num 2016 2016 2016 2016 2016 ...
## $ temp : num 41 37 40 33 19 32 39 40 43 50 ...
## $ yearday : int 1 2 3 4 5 6 7 8 9 10 ...
## $ past_low : num 17 15 16 15 21 14 14 12 21 8.5 ...
## $ past_high: num 51 48 57 55 56 62 52 57 54 47 ...
## $ record : chr "#00000000" "#00000000" "#00000000" "#00000000" ...
# Add point layer of record_high_low: record-breaking days take their colour
# directly from the `record` column via scale_color_identity().
ggplot(past_summ, aes(x = yearday, y = temp)) +
geom_point(shape = 16, alpha = 0.3, col = "#EED8AE") +
geom_linerange(aes(ymin = CI_lower, ymax = CI_upper), col = "#8B7E66") +
geom_line(data = present) +
geom_point(data = record_high_low, aes(col = record)) +
scale_color_identity()
# Finish the function draw_pop_legend
# Draws a hand-built grid legend for the weather plot inside its own
# viewport, then pops the viewport so the caller's drawing state is restored.
draw_pop_legend <- function(x = 0.6, y = 0.2, width = 0.2, height = 0.2, fontsize = 10) {
# Finish viewport() function
pushViewport(viewport(x = x, y = y, width = width, height = height, just = "center"))
legend_labels <- c("Past record high",
"95% CI range",
"Current year",
"Past years",
"Past record low")
# y positions (npc, top to bottom) matching the labels above.
legend_position <- c(0.9, 0.7, 0.5, 0.2, 0.1)
# Finish grid.text() function
grid.text(legend_labels, x = 0.12, y = legend_position,
just = "left", gp = gpar(fontsize = fontsize, col = "grey20"))
# Position dots, rectangle and line
# Points at y = 0.1/0.2/0.9 pair with record low (blue), past years (tan)
# and record high (red) respectively.
point_position_y <- c(0.1, 0.2, 0.9)
point_position_x <- rep(0.06, length(point_position_y))
grid.points(x = point_position_x, y = point_position_y, pch = 16,
gp = gpar(col = c("#0000CD", "#EED8AE", "#CD2626")))
# NOTE(review): the CI rect and the current-year line are both centered at
# y = 0.5 while their labels sit at 0.7 and 0.5 — confirm intended layout.
grid.rect(x = 0.06, y = 0.5, width = 0.06, height = 0.4,
gp = gpar(col = NA, fill = "#8B7E66"))
grid.lines(x = c(0.03, 0.09), y = c(0.5, 0.5),
gp = gpar(col = "black", lwd = 3))
# Add popViewport() for bookkeeping
popViewport()
}
# Call draw_pop_legend()
draw_pop_legend()
# Finish the clean_weather function
# Read one fixed-width weather file and return a tidy data frame: leap days
# removed, a per-year day counter added, and -99 sentinels dropped.
clean_weather <- function(file) {
  raw <- read.fwf(file,
                  header = FALSE,
                  col.names = c("month", "day", "year", "temp"),
                  widths = c(14, 14, 13, 4))
  raw %>%
    filter(month != 2 | day != 29) %>%
    group_by(year) %>%
    mutate(yearday = seq_along(day)) %>%
    ungroup() %>%
    filter(temp != -99)
}
# Import NYNEWYOR.txt: my_data
my_data <- clean_weather(file = "NYNEWYOR.txt")
# Create the stats object
# StatHistorical: drop the current (max) year and, for every x (day of the
# year), attach the normal-theory 95% CI of y for geom_linerange.
StatHistorical <- ggproto("StatHistorical", Stat,
compute_group = function(data, scales, params) {
data <- data %>%
filter(year != max(year)) %>%
group_by(x) %>%
# Fix: Hmisc::smean.cl.normal() returns c(Mean, Lower, Upper); the
# original assigned ymin the Upper bound ([3]) and ymax the Lower
# bound ([2]) — the indices were swapped relative to the correct
# CI_lower/CI_upper usage in past_summ earlier in this file.
mutate(ymin = Hmisc::smean.cl.normal(y)[2],
ymax = Hmisc::smean.cl.normal(y)[3]) %>%
ungroup()
},
required_aes = c("x", "y", "year"))
# Create the layer
# stat_historical(): two layers — all historical days as faint tan points,
# plus the per-day 95% CI as brown line ranges (via StatHistorical).
stat_historical <- function(mapping = NULL, data = NULL, geom = "point",
position = "identity", na.rm = FALSE, show.legend = NA,
inherit.aes = TRUE, ...) {
list(
# Raw data layer: every observation drawn as-is.
layer(
stat = "identity", data = data, mapping = mapping, geom = geom,
position = position, show.legend = show.legend, inherit.aes = inherit.aes,
params = list(na.rm = na.rm, col = "#EED8AE", alpha = 0.3, shape = 16, ...)
),
# CI layer: computed ymin/ymax per x from StatHistorical.
layer(
stat = StatHistorical, data = data, mapping = mapping, geom = "linerange",
position = position, show.legend = show.legend, inherit.aes = inherit.aes,
params = list(na.rm = na.rm, col = "#8B7E66", ...)
)
)
}
# Build the plot
my_data <- clean_weather("NYNEWYOR.txt")
# `year` is passed as an extra aesthetic so the stats can split past/present.
ggplot(my_data, aes(x = yearday, y = temp, year = year)) +
stat_historical()
# Create the stats object
# Stat that keeps only the observations from the most recent year in the data.
StatPresent <- ggproto("StatPresent", Stat,
  compute_group = function(data, scales, params) {
    most_recent <- max(data$year)
    data <- data %>% filter(year == most_recent)
  },
  required_aes = c("x", "y", "year")
)
# Create the layer
# Layer constructor for the current-year line; all arguments are forwarded
# to ggplot2::layer(), with the stat fixed to StatPresent.
stat_present <- function(mapping = NULL, data = NULL, geom = "line",
                         position = "identity", na.rm = FALSE, show.legend = NA,
                         inherit.aes = TRUE, ...) {
  layer(
    geom = geom, stat = StatPresent, mapping = mapping, data = data,
    inherit.aes = inherit.aes, show.legend = show.legend, position = position,
    params = list(na.rm = na.rm, ...)
  )
}
# Build the plot
my_data <- clean_weather("NYNEWYOR.txt")
ggplot(my_data, aes(x = yearday, y = temp, year = year)) +
stat_historical() +
stat_present()
# Create the stats object
# Stat that flags record-setting days in the most recent year relative to all
# previous years: blue (#0000CD) = record low, red (#CD2626) = record high,
# fully transparent (#00000000) otherwise.
StatExtremes <- ggproto("StatExtremes", Stat,
  compute_group = function(data, scales, params) {
    latest_year <- max(data$year)
    present <- filter(data, year == latest_year)
    past <- filter(data, year != latest_year)
    # Per-day historical minimum and maximum across all past years
    past_extremes <- past %>%
      group_by(x) %>%
      summarise(past_low = min(y), past_high = max(y))
    # Join the extremes onto the current year and colour-code records
    data <- present %>%
      left_join(past_extremes, by = "x") %>%
      mutate(record = ifelse(y < past_low, "#0000CD",
                             ifelse(y > past_high, "#CD2626", "#00000000")))
  },
  required_aes = c("x", "y", "year"))
# Create the layer
# Layer constructor for the record-extremes points; extra arguments in `...`
# are passed through to ggplot2::layer() as params.
stat_extremes <- function(mapping = NULL, data = NULL, geom = "point",
                          position = "identity", na.rm = FALSE, show.legend = NA,
                          inherit.aes = TRUE, ...) {
  layer(
    geom = geom, stat = StatExtremes, mapping = mapping, data = data,
    show.legend = show.legend, position = position, inherit.aes = inherit.aes,
    params = list(na.rm = na.rm, ...)
  )
}
# Build the plot
my_data <- clean_weather("NYNEWYOR.txt")
ggplot(my_data, aes(x = yearday, y = temp, year = year)) +
stat_historical() +
stat_present() +
stat_extremes(aes(col=..record..)) +
scale_color_identity()
# Data sets FRPARIS.txt, ILREYKJV.txt, UKLONDON.txt downloaded from International Cities at:
# http://academic.udayton.edu/kissock/http/Weather/citylistWorld.htm
# File paths of all datasets
my_files <- c("NYNEWYOR.txt","FRPARIS.txt", "ILREYKJV.txt", "UKLONDON.txt")
# Build my_data by cleaning each file and row-binding the results.
# lapply() + do.call(rbind) avoids growing a data frame with rbind() inside a
# loop, and fixed = TRUE stops the "." in ".txt" being treated as a regex
# wildcard by sub().
my_data <- do.call(rbind, lapply(my_files, function(file) {
  city <- clean_weather(file)
  city$id <- sub(".txt", "", file, fixed = TRUE)
  city
}))
# Build the final plot, from scratch!
ggplot(data=my_data, aes(x=yearday, y=temp, year=year)) +
stat_historical() +
stat_present() +
stat_extremes(aes(col=..record..)) +
scale_color_identity() +
facet_wrap(~id, ncol=2)
The ggvis package is based on a “grammar of graphics” and is closely linked to ggplot2 (both designed by Hadley Wickham). The objective for ggvis is to combine the analytic power of R with the visual power of JavaScript.
Broadly, the “grammar of graphics” includes several layers such as Data, Coordinate System, Marks, Properties, and the like. This is similar to ggplot2 though with a modified syntax that is more in line with dplyr chaining:
myData %>% ggvis(~myX, ~myY, fill = ~myFill, …) %>% layer_myMarkChoice()
Note that the := operator is the static assignment operator, ensuring that a call to “red” means the color “red” and not merely a character vector coerced to the required length with every entry being “red”. The ~ symbolizes that this is a variable in my dataset. So ~red would mean the variable red in the dataset undergoing plotting.
Some basic example code includes:
library(ggvis)
## Warning: package 'ggvis' was built under R version 3.2.5
##
## Attaching package: 'ggvis'
## The following object is masked from 'package:ggplot2':
##
## resolution
data(mtcars)
str(mtcars)
## 'data.frame': 32 obs. of 11 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp: num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear: num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
# Change the code below to make a graph with red points
mtcars %>% ggvis(~wt, ~mpg, fill := "red") %>% layer_points()
# Change the code below draw smooths instead of points
mtcars %>% ggvis(~wt, ~mpg) %>% layer_smooths()
# Change the code below to make a graph containing both points and a smoothed summary line
mtcars %>% ggvis(~wt, ~mpg) %>% layer_points() %>% layer_smooths()
data(pressure)
str(pressure)
## 'data.frame': 19 obs. of 2 variables:
## $ temperature: num 0 20 40 60 80 100 120 140 160 180 ...
## $ pressure : num 0.0002 0.0012 0.006 0.03 0.09 0.27 0.75 1.85 4.2 8.8 ...
# Adapt the code: show bars instead of points
pressure %>% ggvis(~temperature, ~pressure) %>% layer_bars()
# Adapt the code: show lines instead of points
pressure %>% ggvis(~temperature, ~pressure) %>% layer_lines()
# Extend the code: map the fill property to the temperature variable
pressure %>% ggvis(~temperature, ~pressure, fill=~temperature) %>% layer_points()
# Extend the code: map the size property to the pressure variable
pressure %>% ggvis(~temperature, ~pressure, size=~pressure) %>% layer_points()
There are three main new operators in ggvis (relative to ggplot):
The line is a special type of mark (second most common after points) - stroke, strokeWidth, strokeOpacity, strokeDash, fill, fillOpacity
Other forms include:
Example code includes:
data(faithful)
str(faithful)
## 'data.frame': 272 obs. of 2 variables:
## $ eruptions: num 3.6 1.8 3.33 2.28 4.53 ...
## $ waiting : num 79 54 74 62 85 55 88 85 51 85 ...
faithful %>% ggvis(~waiting, ~eruptions) %>% layer_points()
faithful %>%
ggvis(~waiting, ~eruptions, size = ~eruptions) %>%
layer_points(opacity := 0.5, fill := "blue", stroke := "black")
faithful %>%
ggvis(~waiting, ~eruptions, fillOpacity = ~eruptions) %>%
layer_points(size := 100, fill := "red", stroke := "red", shape := "cross")
data(pressure)
str(pressure)
## 'data.frame': 19 obs. of 2 variables:
## $ temperature: num 0 20 40 60 80 100 120 140 160 180 ...
## $ pressure : num 0.0002 0.0012 0.006 0.03 0.09 0.27 0.75 1.85 4.2 8.8 ...
# Modify this graph to map the size property to the pressure variable
pressure %>% ggvis(~temperature, ~pressure, size = ~pressure) %>% layer_points()
# Modify this graph by setting the size property
pressure %>% ggvis(~temperature, ~pressure, size := 100) %>% layer_points()
# Fix this code to set the fill property to red
pressure %>% ggvis(~temperature, ~pressure, fill := "red") %>% layer_points()
pressure %>%
ggvis(~temperature, ~pressure) %>%
layer_lines(stroke := "red", strokeWidth := 2, strokeDash := 6)
# texas %>% ggvis(~long, ~lat) %>% layer_paths(fill := "darkorange")
data(mtcars)
str(mtcars)
## 'data.frame': 32 obs. of 11 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp: num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear: num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
mtcars %>% compute_smooth(mpg ~ wt)
## pred_ resp_
## 1 1.513000 32.08897
## 2 1.562506 31.68786
## 3 1.612013 31.28163
## 4 1.661519 30.87037
## 5 1.711025 30.45419
## 6 1.760532 30.03318
## 7 1.810038 29.60745
## 8 1.859544 29.17711
## 9 1.909051 28.74224
## 10 1.958557 28.30017
## 11 2.008063 27.83462
## 12 2.057570 27.34766
## 13 2.107076 26.84498
## 14 2.156582 26.33229
## 15 2.206089 25.81529
## 16 2.255595 25.29968
## 17 2.305101 24.79115
## 18 2.354608 24.29542
## 19 2.404114 23.81818
## 20 2.453620 23.36514
## 21 2.503127 22.95525
## 22 2.552633 22.61385
## 23 2.602139 22.32759
## 24 2.651646 22.08176
## 25 2.701152 21.86167
## 26 2.750658 21.65260
## 27 2.800165 21.43987
## 28 2.849671 21.20875
## 29 2.899177 20.95334
## 30 2.948684 20.71584
## 31 2.998190 20.49571
## 32 3.047696 20.28293
## 33 3.097203 20.06753
## 34 3.146709 19.83950
## 35 3.196215 19.58885
## 36 3.245722 19.29716
## 37 3.295228 18.94441
## 38 3.344734 18.56700
## 39 3.394241 18.20570
## 40 3.443747 17.90090
## 41 3.493253 17.62060
## 42 3.542759 17.34002
## 43 3.592266 17.07908
## 44 3.641772 16.81759
## 45 3.691278 16.55757
## 46 3.740785 16.30833
## 47 3.790291 16.07916
## 48 3.839797 15.87937
## 49 3.889304 15.70181
## 50 3.938810 15.52594
## 51 3.988316 15.35173
## 52 4.037823 15.17933
## 53 4.087329 15.00894
## 54 4.136835 14.84072
## 55 4.186342 14.67484
## 56 4.235848 14.51148
## 57 4.285354 14.35082
## 58 4.334861 14.19302
## 59 4.384367 14.03826
## 60 4.433873 13.88672
## 61 4.483380 13.73856
## 62 4.532886 13.59396
## 63 4.582392 13.45310
## 64 4.631899 13.31614
## 65 4.681405 13.18326
## 66 4.730911 13.05464
## 67 4.780418 12.93045
## 68 4.829924 12.81086
## 69 4.879430 12.69604
## 70 4.928937 12.58617
## 71 4.978443 12.48143
## 72 5.027949 12.38198
## 73 5.077456 12.28799
## 74 5.126962 12.19966
## 75 5.176468 12.11713
## 76 5.225975 12.04060
## 77 5.275481 11.97023
## 78 5.324987 11.90620
## 79 5.374494 11.84868
## 80 5.424000 11.79784
# Extend with ggvis() and layer_lines()
mtcars %>% compute_smooth(mpg ~ wt) %>% ggvis(~pred_, ~resp_) %>% layer_lines()
# Extend with layer_points() and layer_smooths()
mtcars %>% ggvis(~wt, ~mpg) %>% layer_points() %>% layer_smooths()
Behind the scenes, ggvis uses several compute functions to help with visualizations:
The ggvis library is especially well designed to interact with the dplyr library (Hadley Wickham):
Example code includes:
data(faithful)
str(faithful)
## 'data.frame': 272 obs. of 2 variables:
## $ eruptions: num 3.6 1.8 3.33 2.28 4.53 ...
## $ waiting : num 79 54 74 62 85 55 88 85 51 85 ...
faithful %>% ggvis(~waiting) %>% layer_histograms(width = 5)
# Finish the command
faithful %>%
compute_bin(~waiting, width = 5) %>%
ggvis(x = ~xmin_, x2 = ~xmax_, y = 0, y2 = ~count_) %>%
layer_rects()
# Build the density plot
faithful %>% ggvis(~waiting, fill := "green") %>% layer_densities()
data(mtcars)
str(mtcars)
## 'data.frame': 32 obs. of 11 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp: num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear: num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
mtcars %>%
ggvis(x = ~factor(cyl)) %>%
layer_bars()
# Instruction 1
mtcars %>%
group_by(cyl) %>%
ggvis(~mpg, ~wt, stroke = ~factor(cyl)) %>%
layer_smooths()
## Warning in rbind_all(out[[1]]): Unequal factor levels: coercing to
## character
# Instruction 2
mtcars %>%
group_by(cyl) %>%
ggvis(~mpg, fill = ~factor(cyl)) %>%
layer_densities()
## Warning in rbind_all(out[[1]]): Unequal factor levels: coercing to
## character
mtcars %>%
group_by(cyl, am) %>%
ggvis(~mpg, fill = ~interaction(cyl, am)) %>%
layer_densities()
## Warning in rbind_all(out[[1]]): Unequal factor levels: coercing to
## character
Can add interactivity to plots in ggvis:
Multi-layered ggvis plots:
Example code includes:
data(faithful)
str(faithful)
## 'data.frame': 272 obs. of 2 variables:
## $ eruptions: num 3.6 1.8 3.33 2.28 4.53 ...
## $ waiting : num 79 54 74 62 85 55 88 85 51 85 ...
data(mtcars)
str(mtcars)
## 'data.frame': 32 obs. of 11 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp: num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear: num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
# Adapt the code: set fill with a select box
faithful %>%
ggvis(~waiting, ~eruptions, fillOpacity := 0.5,
shape := input_select(label = "Choose shape:",
choices = c("circle", "square", "cross",
"diamond", "triangle-up", "triangle-down"
)
),
fill := input_select(label = "Choose color:",
choices = c("black", "red", "blue", "green")
)
) %>%
layer_points()
## Warning: Can't output dynamic/interactive ggvis plots in a knitr document.
## Generating a static (non-dynamic, non-interactive) version of the plot.
# Add radio buttons to control the fill of the plot
mtcars %>%
ggvis(~mpg, ~wt,
fill := input_radiobuttons(label = "Choose color:",
choices = c("black", "red", "blue", "green")
)
) %>%
layer_points()
## Warning: Can't output dynamic/interactive ggvis plots in a knitr document.
## Generating a static (non-dynamic, non-interactive) version of the plot.
mtcars %>%
ggvis(~mpg, ~wt,
fill := input_text(label = "Choose color:", value = "black")) %>%
layer_points()
## Warning: Can't output dynamic/interactive ggvis plots in a knitr document.
## Generating a static (non-dynamic, non-interactive) version of the plot.
# Map the fill property to a select box that returns variable names
mtcars %>%
ggvis(~mpg, ~wt, fill = input_select(label = "Choose fill variable:",
choices = names(mtcars), map=as.name
)
) %>%
layer_points()
## Warning: Can't output dynamic/interactive ggvis plots in a knitr document.
## Generating a static (non-dynamic, non-interactive) version of the plot.
# Map the binwidth to a numeric field ("Choose a binwidth:")
mtcars %>%
ggvis(~mpg) %>%
layer_histograms(width = input_numeric(label = "Choose a binwidth:", value = 1))
## Warning: Can't output dynamic/interactive ggvis plots in a knitr document.
## Generating a static (non-dynamic, non-interactive) version of the plot.
# Map the binwidth to a slider bar ("Choose a binwidth:") with the correct specifications
mtcars %>%
ggvis(~mpg) %>%
layer_histograms(width = input_slider(label = "Choose a binwidth:", 1, 20))
## Warning: Can't output dynamic/interactive ggvis plots in a knitr document.
## Generating a static (non-dynamic, non-interactive) version of the plot.
# Add a layer of points to the graph below.
pressure %>%
ggvis(~temperature, ~pressure, stroke := "skyblue") %>%
layer_lines() %>%
layer_points()
# Copy and adapt so that only the lines layer uses a skyblue stroke.
pressure %>%
ggvis(~temperature, ~pressure) %>%
layer_lines(stroke := "skyblue") %>%
layer_points()
# Rewrite the code below so that only the points layer uses the shape property.
pressure %>%
ggvis(~temperature, ~pressure) %>%
layer_lines(stroke := "skyblue") %>%
layer_points(shape := "triangle-up")
# Refactor the code for the graph below to make it as concise as possible
pressure %>%
ggvis(~temperature, ~pressure, stroke := "skyblue", strokeOpacity := 0.5, strokeWidth := 5) %>%
layer_lines() %>%
layer_points(fill = ~temperature,
shape := "triangle-up",
size := 300)
# Add more layers to the line plot
pressure %>%
ggvis(~temperature, ~pressure) %>%
layer_lines(opacity := 0.5) %>%
layer_points() %>%
layer_model_predictions(model = "lm", stroke := "navy") %>%
layer_smooths(stroke := "skyblue")
## Guessing formula = pressure ~ temperature
The add_axis() function can be used to change the titles and axis labels:
The add_legends() function can help with cleaning up legends (make them look tidier). This is similar to the arguments passed to the “adding an axis” above.
Can also customize the scales (relationships between data spaces and visual spaces) for the data:
Example code includes:
data(faithful)
str(faithful)
## 'data.frame': 272 obs. of 2 variables:
## $ eruptions: num 3.6 1.8 3.33 2.28 4.53 ...
## $ waiting : num 79 54 74 62 85 55 88 85 51 85 ...
# Defaulted axis
faithful %>%
ggvis(~waiting, ~eruptions) %>%
layer_points()
# Customized axis
faithful %>%
ggvis(~waiting, ~eruptions) %>%
layer_points() %>%
add_axis("x", title="Time since previous eruption (m)",
values=c(50, 60, 70, 80, 90), subdivide=9, orient="top"
) %>%
add_axis("y", title="Duration of eruption (m)", values=c(2, 3, 4, 5),
subdivide=9, orient="right"
)
data(pressure)
str(pressure)
## 'data.frame': 19 obs. of 2 variables:
## $ temperature: num 0 20 40 60 80 100 120 140 160 180 ...
## $ pressure : num 0.0002 0.0012 0.006 0.03 0.09 0.27 0.75 1.85 4.2 8.8 ...
# Add a legend
faithful %>%
ggvis(~waiting, ~eruptions, opacity := 0.6,
fill = ~factor(round(eruptions))) %>%
layer_points() %>%
add_legend("fill", title="~ duration (m)", orient="left")
# Original code with jumbled legends
faithful %>%
ggvis(~waiting, ~eruptions, opacity := 0.6,
fill = ~factor(round(eruptions)), shape = ~factor(round(eruptions)),
size = ~round(eruptions)) %>%
layer_points()
# Fix the legend
faithful %>%
ggvis(~waiting, ~eruptions, opacity := 0.6,
fill = ~factor(round(eruptions)), shape = ~factor(round(eruptions)),
size = ~round(eruptions)) %>%
layer_points() %>%
add_legend(c("fill", "shape", "size"), title="~ duration (m)")
data(mtcars)
str(mtcars)
## 'data.frame': 32 obs. of 11 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp: num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear: num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
# Add a scale_numeric()
mtcars %>%
ggvis(~wt, ~mpg, fill = ~disp, stroke = ~disp, strokeWidth := 2) %>%
layer_points() %>%
scale_numeric("fill", range = c("red", "yellow")) %>%
scale_numeric("stroke", range = c("darkred", "orange"))
# Add a scale_numeric()
mtcars %>% ggvis(~wt, ~mpg, fill = ~hp) %>%
layer_points() %>%
scale_numeric("fill", range=c("green", "beige"))
# Add a scale_nominal()
mtcars %>% ggvis(~wt, ~mpg, fill = ~factor(cyl)) %>%
layer_points() %>%
scale_nominal("fill", range=c("purple", "blue", "green"))
# Original plot becomes too transparent
mtcars %>% ggvis(x = ~wt, y = ~mpg, fill = ~factor(cyl), opacity = ~hp) %>%
layer_points()
# Range to prevent overly transparent data points
mtcars %>% ggvis(x = ~wt, y = ~mpg, fill = ~factor(cyl), opacity = ~hp) %>%
layer_points() %>%
scale_numeric("opacity", range=c(0.2, 1))
mtcars %>% ggvis(~wt, ~mpg, fill = ~disp) %>%
layer_points() %>%
scale_numeric("y", domain = c(0, NA)) %>% # NA means top-of-data-range
scale_numeric("x", domain = c(0, 6))
mtcars$color <- c('red' , 'teal' , '#cccccc' , 'tan' , 'red' , 'teal' , '#cccccc' , 'tan' ,
'red' , 'teal' , '#cccccc' , 'tan' , 'red' , 'teal' , '#cccccc' , 'tan' ,
'red' , 'teal' , '#cccccc' , 'tan' , 'red' , 'teal' , '#cccccc' , 'tan' ,
'red' , 'teal' , '#cccccc' , 'tan' , 'red' , 'teal' , '#cccccc' , 'tan'
)
# Using fill by mapping the "color" variable to the ggvis scales
mtcars %>%
ggvis(x = ~wt, y = ~mpg, fill = ~color) %>%
layer_points()
# Using fill based directly on the values in the "color" variable
mtcars %>%
ggvis(x = ~wt, y = ~mpg, fill := ~color) %>%
layer_points()
Chapter 1 - Basic Mapping (ggplot2, ggmap)
Spatial data is data associated with locations (geo-spatial if it lies on the surface of the earth):
There are options to control what is returned by get_map (the default is a terrain map from Google):
Common types of spatial data:
Key libraries are called without caching:
library(ggmap)
library(maps)
library(sp)
library(tmap)
## Warning: package 'tmap' was built under R version 3.2.5
## Warning: replacing previous import by 'tmaptools::%>%' when loading 'tmap'
And then, example code includes:
library(ggmap)
corvallis <- c(lon = -123.2620, lat = 44.5646)
# Get map at zoom level 5: map_5
map_5 <- get_map(corvallis, zoom = 5, scale = 1)
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=44.5646,-123.262&zoom=5&size=640x640&scale=1&maptype=terrain&language=en-EN&sensor=false
# Plot map at zoom level 5
ggmap(map_5)
# Get map at zoom level 13: corvallis_map
corvallis_map <- get_map(corvallis, zoom = 13, scale = 1)
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=44.5646,-123.262&zoom=13&size=640x640&scale=1&maptype=terrain&language=en-EN&sensor=false
# Plot map at zoom level 13
ggmap(corvallis_map)
sales_lon <- "-123.280349 ; -123.2330056 ; -123.263518 ; -123.259873 ; -123.263154 ; -123.284723 ; -123.236332 ; -123.266215 ; -123.2842478 ; -123.3094688 ; -123.260153 ; -123.292598 ; -123.2463352 ; -123.272113 ; -123.24968 ; -123.22785 ; -123.2759501 ; -123.300568 ; -123.230366 ; -123.274744"
sales_lat <- "44.578079 ; 44.5971776 ; 44.5692329 ; 44.594534 ; 44.536056 ; 44.598765 ; 44.603042 ; 44.593486 ; 44.5986974 ; 44.5519677 ; 44.595555 ; 44.594582 ; 44.5946242 ; 44.570883 ; 44.590523 ; 44.597 ; 44.5837454 ; 44.5546821 ; 44.590765 ; 44.572145"
sales_yrb <- "1967 ; 1990 ; 1948 ; 1978 ; 1979 ; 2002 ; 1972 ; 1970 ; 2002 ; 1959 ; 1998 ; 1971 ; 1998 ; 1938 ; 1963 ; 1972 ; 1967 ; 1952 ; 1965 ; 1926"
sales_beds <- "5 ; 3 ; 3 ; 0 ; 0 ; 3 ; 3 ; 3 ; 3 ; 3 ; 3 ; 5 ; 0 ; 6 ; 0 ; 4 ; 5 ; 3 ; 3 ; 3"
sales_prc <- "267500 ; 255000 ; 295000 ; 5000 ; 13950 ; 233000 ; 245000 ; 216000 ; 231000 ; 215000 ; 307000 ; 385000 ; 70000 ; 435350 ; 3000 ; 2e+05 ; 295900 ; 193000 ; 243000 ; 279900"
sales_finsq <- "1520 ; 1665 ; 1440 ; 784 ; 1344 ; 1567 ; 1174 ; 912 ; 1404 ; 1060 ; 1500 ; 1883 ; 1512 ; 2822 ; 540 ; 1855 ; 1960 ; 1316 ; 1872 ; 2278"
sales_class <- "Dwelling ; Dwelling ; Dwelling ; Mobile Home ; Mobile Home ; Dwelling ; Dwelling ; Dwelling ; Dwelling ; Dwelling ; Dwelling ; Dwelling ; Mobile Home ; Dwelling ; Mobile Home ; Dwelling ; Dwelling ; Dwelling ; Dwelling ; Dwelling"
sales_month <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 12, 10, 8, 6, 4, 2, 1, 1)
# Helper: split a " ; "-delimited string and coerce the pieces to numeric.
# Replaces six copies of the same as.numeric(strsplit(...)[[1]]) pattern.
parse_nums <- function(s) as.numeric(strsplit(s, split = " ; ", fixed = TRUE)[[1]])
sales <- data.frame(lon = parse_nums(sales_lon),
                    lat = parse_nums(sales_lat),
                    year_built = parse_nums(sales_yrb),
                    bedrooms = parse_nums(sales_beds),
                    price = parse_nums(sales_prc),
                    finished_squarefeet = parse_nums(sales_finsq),
                    class = strsplit(sales_class, split = " ; ")[[1]],
                    month = as.integer(sales_month)
)
# Look at head() of sales
head(sales)
## lon lat year_built bedrooms price finished_squarefeet
## 1 -123.2803 44.57808 1967 5 267500 1520
## 2 -123.2330 44.59718 1990 3 255000 1665
## 3 -123.2635 44.56923 1948 3 295000 1440
## 4 -123.2599 44.59453 1978 0 5000 784
## 5 -123.2632 44.53606 1979 0 13950 1344
## 6 -123.2847 44.59877 2002 3 233000 1567
## class month
## 1 Dwelling 1
## 2 Dwelling 2
## 3 Dwelling 3
## 4 Mobile Home 4
## 5 Mobile Home 5
## 6 Dwelling 6
# Swap out call to ggplot() with call to ggmap()
ggmap(corvallis_map) +
geom_point(aes(lon, lat), data = sales)
# Map color to year_built
ggmap(corvallis_map) +
geom_point(aes(lon, lat, color=year_built), data = sales)
# Map size to bedrooms
ggmap(corvallis_map) +
geom_point(aes(lon, lat, size=bedrooms), data = sales)
# Map color to price / finished_squarefeet
ggmap(corvallis_map) +
geom_point(aes(lon, lat, color=price/finished_squarefeet), data = sales)
corvallis <- c(lon = -123.2620, lat = 44.5646)
# Add a maptype argument to get a satellite map
corvallis_map_sat <- get_map(corvallis, zoom = 13, maptype="satellite")
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=44.5646,-123.262&zoom=13&size=640x640&scale=2&maptype=satellite&language=en-EN&sensor=false
# Edit to display satellite map
ggmap(corvallis_map_sat) +
geom_point(aes(lon, lat, color = year_built), data = sales)
# Add source and maptype to get toner map from Stamen Maps
corvallis_map_bw <- get_map(corvallis, zoom = 13, maptype="toner", source="stamen")
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=44.5646,-123.262&zoom=13&size=640x640&scale=2&maptype=terrain&sensor=false
## Map from URL : http://tile.stamen.com/toner/13/1289/2959.png
## Map from URL : http://tile.stamen.com/toner/13/1290/2959.png
## Map from URL : http://tile.stamen.com/toner/13/1291/2959.png
## Map from URL : http://tile.stamen.com/toner/13/1292/2959.png
## Map from URL : http://tile.stamen.com/toner/13/1289/2960.png
## Map from URL : http://tile.stamen.com/toner/13/1290/2960.png
## Map from URL : http://tile.stamen.com/toner/13/1291/2960.png
## Map from URL : http://tile.stamen.com/toner/13/1292/2960.png
## Map from URL : http://tile.stamen.com/toner/13/1289/2961.png
## Map from URL : http://tile.stamen.com/toner/13/1290/2961.png
## Map from URL : http://tile.stamen.com/toner/13/1291/2961.png
## Map from URL : http://tile.stamen.com/toner/13/1292/2961.png
## Map from URL : http://tile.stamen.com/toner/13/1289/2962.png
## Map from URL : http://tile.stamen.com/toner/13/1290/2962.png
## Map from URL : http://tile.stamen.com/toner/13/1291/2962.png
## Map from URL : http://tile.stamen.com/toner/13/1292/2962.png
# Edit to display toner map
ggmap(corvallis_map_bw) +
geom_point(aes(lon, lat, color = year_built), data = sales)
# Use base_layer argument to ggmap() to specify data and x, y mappings
ggmap(corvallis_map_bw, base_layer=ggplot(aes(lon, lat), data = sales)) +
geom_point(aes(color = year_built))
# Use base_layer argument to ggmap() and add facet_wrap()
ggmap(corvallis_map_bw, base_layer=ggplot(aes(lon, lat), data = sales)) +
geom_point(aes(color = class)) +
facet_wrap(~ class)
# Plot house sales using qmplot()
qmplot(lon, lat, data=sales, geom="point", color=bedrooms) +
facet_wrap(~ month)
## Using zoom = 13...
## Map from URL : http://tile.stamen.com/toner-lite/13/1289/2959.png
## Map from URL : http://tile.stamen.com/toner-lite/13/1290/2959.png
## Map from URL : http://tile.stamen.com/toner-lite/13/1291/2959.png
## Map from URL : http://tile.stamen.com/toner-lite/13/1289/2960.png
## Map from URL : http://tile.stamen.com/toner-lite/13/1290/2960.png
## Map from URL : http://tile.stamen.com/toner-lite/13/1291/2960.png
## Map from URL : http://tile.stamen.com/toner-lite/13/1289/2961.png
## Map from URL : http://tile.stamen.com/toner-lite/13/1290/2961.png
## Map from URL : http://tile.stamen.com/toner-lite/13/1291/2961.png
## Warning: `panel.margin` is deprecated. Please use `panel.spacing` property
## instead
## *** NEED TO COMMENT OUT (do not have ward_sales data file)
# Add a point layer with color mapped to ward
# ggplot(ward_sales, aes(lon, lat)) +
# geom_point(aes(color=ward))
# Add a point layer with color mapped to group
# ggplot(ward_sales, aes(lon, lat)) +
# geom_point(aes(color=group))
# Add a path layer with group mapped to group
# ggplot(ward_sales, aes(lon, lat)) +
# geom_path(aes(group=group))
# Add a polygon layer with fill mapped to ward, and group to group
# ggplot(ward_sales, aes(lon, lat)) +
# geom_polygon(aes(fill=ward, group=group))
# Fix the polygon cropping
# ggmap(corvallis_map_bw,
# base_layer = ggplot(ward_sales, aes(lon, lat)),
# extent = "normal", maprange=FALSE
# ) +
# geom_polygon(aes(group = group, fill = ward))
# Repeat, but map fill to num_sales
# ggmap(corvallis_map_bw,
# base_layer = ggplot(ward_sales, aes(lon, lat)),
# extent = "normal", maprange=FALSE
# ) +
# geom_polygon(aes(group = group, fill = num_sales))
# Repeat again, but map fill to avg_price
# ggmap(corvallis_map_bw,
# base_layer = ggplot(ward_sales, aes(lon, lat)),
# extent = "normal", maprange=FALSE
# ) +
# geom_polygon(aes(group = group, fill = avg_price), alpha=0.8)
# Add a geom_point() layer
# ggplot(preds, aes(lon, lat)) + geom_point()
# Add a tile layer with fill mapped to predicted_price
# ggplot(preds, aes(lon, lat)) + geom_tile(aes(fill=predicted_price))
# Use ggmap() instead of ggplot()
# ggmap(corvallis_map_bw) + geom_tile(data=preds, aes(lon, lat, fill=predicted_price), alpha=0.8)
Chapter 2 - Point and Polygon Data
Introducing sp objects, preferred to data frames for spatial objects, which contain the CRS (coordinate reference system):
Introducing the sp and S4 objects (more details on the spdf from the exercises):
Additional information on the sp class and methods:
Introduction to tmap, which is very similar in concept to ggplot2:
Example code includes:
# Create the countries_sp dataset
library(maps)
# Attach sp BEFORE it is needed: CRS() below comes from sp, so the original
# ordering (library(sp) after the call) only worked because sp happened to be
# attached earlier in the document. This makes the chunk self-contained.
library(sp)
origCountry <- maps::map("world", fill = TRUE)
# Keep the part of each polygon name before any ":sub-region" suffix
IDs <- sapply(strsplit(origCountry$names, ":"), function(x) x[1])
countries_sp <- maptools::map2SpatialPolygons(origCountry, IDs = IDs,
                                              proj4string = CRS("+proj=longlat +datum=WGS84")
)
# Print countries_sp (skipped: the raw print output is extremely long)
# print(countries_sp)
# Call summary() on countries_sp
summary(countries_sp)
## Object of class SpatialPolygons
## Coordinates:
## min max
## x -180.00000 190.27084
## y -85.19218 83.59961
## Is projected: FALSE
## proj4string :
## [+proj=longlat +datum=WGS84 +ellps=WGS84 +towgs84=0,0,0]
# Call plot() on countries_sp
plot(countries_sp)
# Call str() on countries_sp (skipped: unbounded str() output is unwieldy; the max.level = 2 call below is far more readable)
# str(countries_sp)
# Call str() on countries_sp with max.level = 2
str(countries_sp, max.level=2)
## Formal class 'SpatialPolygons' [package "sp"] with 4 slots
## ..@ polygons :List of 253
## .. .. [list output truncated]
## ..@ plotOrder : int [1:253] 8 184 40 241 48 33 14 86 114 10 ...
## ..@ bbox : num [1:2, 1:2] -180 -85.2 190.3 83.6
## .. ..- attr(*, "dimnames")=List of 2
## ..@ proj4string:Formal class 'CRS' [package "sp"] with 1 slot
## *** MAY NEED TO SKIP UNLESS CAN FIND/CREATE THE spdf ["name", "iso_a3", "population", "gdp", "region", "subregion"]
# Skeleton country table; the attribute columns are filled from the parallel
# "#N/A"-delimited lists that follow this chunk.
ctry_db <- data.frame(ctr = seq_along(countries_sp@polygons),
                      name = NA, iso_a3 = NA, population = NA,
                      gdp = NA, region = NA, subregion = NA
)
# Pull each polygon's ID slot as the country name — vectorized with sapply
# instead of the original index loop (which also used the 1:nrow() pattern).
ctry_db$name <- sapply(countries_sp@polygons, function(p) p@ID)
a3List <- "AFG\nALB\nDZA\nASM\nAND\nAGO\nAIA\nATA\nATG\nARG\nARM\nABW\n#N/A\nAUS\nAUT\nAZE\n#N/A\nBHS\nBHR\nBGD\nBRB\n#N/A\nBLR\nBEL\nBLZ\nBEN\nBMU\nBTN\nBOL\n#N/A\nBIH\nBWA\nBRA\nBRN\nBGR\nBFA\nBDI\nKHM\nCMR\nCAN\n#N/A\nCPV\nCYM\nCAF\nTCD\n#N/A\nCHL\nCHN\nCXR\nCCK\nCOL\nCOM\nCOK\nCRI\nHRV\nCUB\nCUW\nCYP\nCZE\nCOD\nDNK\nDJI\nDMA\nDOM\nECU\nEGY\nSLV\nGNQ\nERI\nEST\nETH\nFLK\nFRO\nFJI\nFIN\nFRA\nGUF\nPYF\nATF\nGAB\nGMB\nGEO\nDEU\nGHA\nGRC\nGRL\nGRD\n#N/A\nGLP\nGUM\nGTM\nGGY\nGIN\nGNB\nGUY\nHTI\nHMD\nHND\nHUN\nISL\nIND\nIDN\nIRN\nIRQ\nIRL\nIMN\nISR\nITA\nCIV\nJAM\nJPN\nJEY\nJOR\nKAZ\nKEN\nKIR\n#N/A\nKWT\nKGZ\nLAO\nLVA\nLBN\nLSO\nLBR\nLBY\nLIE\nLTU\nLUX\nMKD\nMDG\n#N/A\nMWI\nMYS\nMDV\nMLI\nMLT\nMHL\nMTQ\nMRT\nMUS\nMYT\nMEX\nFSM\nMDA\nMCO\nMNG\nMNE\nMSR\nMAR\nMOZ\nMMR\nNAM\nNRU\nNPL\nNLD\n#N/A\nNCL\nNZL\nNIC\nNER\nNGA\nNIU\nNFK\nPRK\nMNP\nNOR\nOMN\nPAK\nPLW\nPSE\nPAN\nPNG\nPRY\nPER\nPHL\nPCN\nPOL\nPRT\nPRI\nQAT\nCOG\nREU\nROU\nRUS\nRWA\nBES\nBLM\nSHN\nKNA\nLCA\nMAF\nSPM\nVCT\nWSM\nSMR\nSTP\nSAU\nSEN\nSRB\nSYC\n#N/A\nSLE\nSGP\n#N/A\nSXM\nSVK\nSVN\nSLB\nSOM\nZAF\nSGS\nKOR\n#N/A\nSSD\nESP\nLKA\nSDN\nSUR\nSWZ\nSWE\nCHE\nSYR\nTWN\nTJK\nTZA\nTHA\nTLS\n#N/A\nTGO\nTON\nTTO\nTUN\nTUR\nTKM\nTCA\nUGA\nGBR\nUKR\nARE\nURY\nUSA\nUZB\nVUT\nVAT\nVEN\nVNM\nVGB\nVIR\nWLF\nESH\nYEM\nZMB\nZWE\n"
a3Split <- strsplit(a3List, "\n")[[1]]
a3Split[a3Split == "#N/A"] <- NA
ctry_db$iso_a3 <- a3Split
popList <- "33369945\n2903700\n40375954\n55602\n69165\n25830958\n14763\n#N/A\n92738\n43847277\n3026048\n104263\n#N/A\n24309330\n8569633\n9868447\n#N/A\n392718\n1396829\n162910864\n285006\n#N/A\n9481521\n11371928\n366942\n11166658\n61662\n784103\n10888402\n#N/A\n3802134\n2303820\n209567920\n428874\n7097796\n18633725\n11552561\n15827241\n23924407\n36286378\n#N/A\n526993\n60764\n4998493\n14496739\n#N/A\n18131850\n1382323332\n#N/A\n#N/A\n48654392\n807118\n20948\n4857218\n4225001\n11392889\n158635\n1176598\n10548058\n79722624\n5690750\n899598\n73016\n10648613\n16385450\n93383574\n6146419\n869587\n5351680\n1309104\n101853268\n2912\n48239\n897537\n5523904\n64668129\n275688\n285735\n#N/A\n1763142\n2054986\n3979781\n80682351\n28033375\n10919459\n56196\n107327\n#N/A\n470547\n172094\n16672956\n164466\n12947122\n1888429\n770610\n10848175\n#N/A\n8189501\n9821318\n331778\n1326801576\n260581100\n80043146\n37547686\n4713993\n88421\n8192463\n59801004\n23254184\n2803362\n126323715\n#N/A\n7747800\n17855384\n47251449\n114405\n#N/A\n4007146\n6033769\n6918367\n1955742\n5988153\n2160309\n4615222\n6330159\n37776\n2850030\n576243\n2081012\n24915822\n#N/A\n17749826\n30751602\n369812\n18134835\n419615\n53069\n396364\n4166463\n1277459\n246496\n128632004\n#N/A\n4062862\n37863\n3006444\n626101\n5154\n34817065\n28751362\n54363426\n2513981\n10263\n28850717\n16979729\n#N/A\n266431\n4565185\n6150035\n20715285\n186987563\n1612\n#N/A\n25281327\n55389\n5271958\n4654471\n192826502\n21501\n4797239\n3990406\n7776115\n6725430\n31774225\n102250133\n#N/A\n38593161\n10304434\n3680772\n2291368\n4740992\n867214\n19372734\n143439832\n11882766\n#N/A\n#N/A\n3956\n56183\n186383\n#N/A\n6301\n109644\n194523\n31950\n194390\n32157974\n15589485\n8812705\n97026\n#N/A\n6592102\n5696506\n#N/A\n39538\n5429418\n2069362\n594934\n11079013\n54978907\n#N/A\n50503933\n#N/A\n12733427\n46064604\n20810816\n41175541\n547610\n1304063\n9851852\n8379477\n18563595\n23395600\n8669464\n55155473\n68146609\n1211245\n#N/A\n7496833\n106915\n13
64973\n11375220\n79622062\n5438670\n34904\n40322768\n65111143\n44624373\n9266971\n3444071\n324118787\n30300446\n270470\n801\n31518855\n94444200\n30659\n106415\n13112\n584206\n27477600\n16717332\n15966810\n"
# Break the newline-delimited population dump into one entry per country,
# turn the "#N/A" placeholders into real missing values, and store the
# result as a numeric column on ctry_db.
# NOTE(review): the popList literal above appears wrapped mid-entry by
# document extraction ("...13" / "64973..."), which would yield two bogus
# entries when split -- in the original source it is presumably one line;
# verify before re-running.
popSplit <- strsplit(popList, "\n", fixed = TRUE)[[1]]
popSplit <- replace(popSplit, popSplit == "#N/A", NA)
ctry_db$population <- as.numeric(popSplit)
rgnList <- "Asia\nEurope\nAfrica\nOceania\nEurope\nAfrica\nAmericas\n#N/A\nAmericas\nAmericas\nAsia\nAmericas\n#N/A\nOceania\nEurope\nAsia\n#N/A\nAmericas\nAsia\nAsia\nAmericas\n#N/A\nEurope\nEurope\nAmericas\nAfrica\nAmericas\nAsia\nAmericas\n#N/A\nEurope\nAfrica\nAmericas\nAsia\nEurope\nAfrica\nAfrica\nAsia\nAfrica\nAmericas\n#N/A\nAfrica\nAmericas\nAfrica\nAfrica\n#N/A\nAmericas\nAsia\n#N/A\n#N/A\nAmericas\nAfrica\nOceania\nAmericas\nEurope\nAmericas\nAmericas\nEurope\nEurope\nAfrica\nEurope\nAfrica\nAmericas\nAmericas\nAmericas\nAfrica\nAmericas\nAfrica\nAfrica\nEurope\nAfrica\nAmericas\nEurope\nOceania\nEurope\nEurope\nAmericas\nOceania\n#N/A\nAfrica\nAfrica\nAsia\nEurope\nAfrica\nEurope\nAmericas\nAmericas\n#N/A\nAmericas\nOceania\nAmericas\nEurope\nAfrica\nAfrica\nAmericas\nAmericas\n#N/A\nAmericas\nEurope\nEurope\nAsia\nAsia\nAsia\nAsia\nEurope\nEurope\nAsia\nEurope\nAfrica\nAmericas\nAsia\n#N/A\nAsia\nAsia\nAfrica\nOceania\n#N/A\nAsia\nAsia\nAsia\nEurope\nAsia\nAfrica\nAfrica\nAfrica\nEurope\nEurope\nEurope\nEurope\nAfrica\n#N/A\nAfrica\nAsia\nAsia\nAfrica\nEurope\nOceania\nAmericas\nAfrica\nAfrica\nAfrica\nAmericas\n#N/A\nEurope\nEurope\nAsia\nEurope\nAmericas\nAfrica\nAfrica\nAsia\nAfrica\nOceania\nAsia\nEurope\n#N/A\nOceania\nOceania\nAmericas\nAfrica\nAfrica\nOceania\n#N/A\nAsia\nOceania\nEurope\nAsia\nAsia\nOceania\nAsia\nAmericas\nOceania\nAmericas\nAmericas\nAsia\n#N/A\nEurope\nEurope\nAmericas\nAsia\nAfrica\nAfrica\nEurope\nEurope\nAfrica\n#N/A\n#N/A\nAfrica\nAmericas\nAmericas\n#N/A\nAmericas\nAmericas\nOceania\nEurope\nAfrica\nAsia\nAfrica\nEurope\nAfrica\n#N/A\nAfrica\nAsia\n#N/A\nAmericas\nEurope\nEurope\nOceania\nAfrica\nAfrica\n#N/A\nAsia\n#N/A\nAfrica\nEurope\nAsia\nAfrica\nAmericas\nAfrica\nEurope\nEurope\nAsia\nAsia\nAsia\nAfrica\nAsia\nAsia\n#N/A\nAfrica\nOceania\nAmericas\nAfrica\nAsia\nAsia\nAmericas\nAfrica\nEurope\nEurope\nAsia\nAmericas\nAmericas\nAsia\nOceania\nEurope\nAmericas\nAsia\nAmericas\nAmericas\nOceania\nAfrica\nAsia\nAfrica
\nAfrica\n"
# Region labels arrive as a single "\n"-separated string: split it apart,
# blank out the "#N/A" sentinel entries, and attach as the (character)
# region column.
# NOTE(review): the rgnList literal above is broken across two physical
# lines ("...Africa" / "\nAfrica\n"), which would introduce a stray empty
# element on split; presumably one line in the original source -- verify.
rgnSplit <- unlist(strsplit(rgnList, "\n"))
rgnSplit <- replace(rgnSplit, rgnSplit == "#N/A", NA_character_)
ctry_db$region <- rgnSplit
subList <- "Southern Asia\nSouthern Europe\nNorthern Africa\nPolynesia\nSouthern Europe\nMiddle Africa\nCaribbean\n#N/A\nCaribbean\nSouth America\nWestern Asia\nCaribbean\n#N/A\nAustralia and New Zealand\nWestern Europe\nWestern Asia\n#N/A\nCaribbean\nWestern Asia\nSouthern Asia\nCaribbean\n#N/A\nEastern Europe\nWestern Europe\nCentral America\nWestern Africa\nNorthern America\nSouthern Asia\nSouth America\n#N/A\nSouthern Europe\nSouthern Africa\nSouth America\nSouth-Eastern Asia\nEastern Europe\nWestern Africa\nEastern Africa\nSouth-Eastern Asia\nMiddle Africa\nNorthern America\n#N/A\nWestern Africa\nCaribbean\nMiddle Africa\nMiddle Africa\n#N/A\nSouth America\nEastern Asia\n#N/A\n#N/A\nSouth America\nEastern Africa\nPolynesia\nCentral America\nSouthern Europe\nCaribbean\nCaribbean\nSouthern Europe\nEastern Europe\nMiddle Africa\nNorthern Europe\nEastern Africa\nCaribbean\nCaribbean\nSouth America\nNorthern Africa\nCentral America\nMiddle Africa\nEastern Africa\nNorthern Europe\nEastern Africa\nSouth America\nNorthern Europe\nMelanesia\nNorthern Europe\nWestern Europe\nSouth America\nPolynesia\n#N/A\nMiddle Africa\nWestern Africa\nWestern Asia\nWestern Europe\nWestern Africa\nSouthern Europe\nNorthern America\nCaribbean\n#N/A\nCaribbean\nMicronesia\nCentral America\nNorthern Europe\nWestern Africa\nWestern Africa\nSouth America\nCaribbean\n#N/A\nCentral America\nEastern Europe\nNorthern Europe\nSouthern Asia\nSouth-Eastern Asia\nSouthern Asia\nWestern Asia\nNorthern Europe\nNorthern Europe\nWestern Asia\nSouthern Europe\nWestern Africa\nCaribbean\nEastern Asia\n#N/A\nWestern Asia\nCentral Asia\nEastern Africa\nMicronesia\n#N/A\nWestern Asia\nCentral Asia\nSouth-Eastern Asia\nNorthern Europe\nWestern Asia\nSouthern Africa\nWestern Africa\nNorthern Africa\nWestern Europe\nNorthern Europe\nWestern Europe\nSouthern Europe\nEastern Africa\n#N/A\nEastern Africa\nSouth-Eastern Asia\nSouthern Asia\nWestern Africa\nSouthern Europe\nMicronesia\nCaribbean\nWestern 
Africa\nEastern Africa\nEastern Africa\nCentral America\n#N/A\nEastern Europe\nWestern Europe\nEastern Asia\nSouthern Europe\nCaribbean\nNorthern Africa\nEastern Africa\nSouth-Eastern Asia\nSouthern Africa\nMicronesia\nSouthern Asia\nWestern Europe\n#N/A\nMelanesia\nAustralia and New Zealand\nCentral America\nWestern Africa\nWestern Africa\nPolynesia\n#N/A\nEastern Asia\nMicronesia\nNorthern Europe\nWestern Asia\nSouthern Asia\nMicronesia\nWestern Asia\nCentral America\nMelanesia\nSouth America\nSouth America\nSouth-Eastern Asia\n#N/A\nEastern Europe\nSouthern Europe\nCaribbean\nWestern Asia\nMiddle Africa\nEastern Africa\nEastern Europe\nEastern Europe\nEastern Africa\n#N/A\n#N/A\nWestern Africa\nCaribbean\nCaribbean\n#N/A\nNorthern America\nCaribbean\nPolynesia\nSouthern Europe\nMiddle Africa\nWestern Asia\nWestern Africa\nSouthern Europe\nEastern Africa\n#N/A\nWestern Africa\nSouth-Eastern Asia\n#N/A\nCaribbean\nEastern Europe\nSouthern Europe\nMelanesia\nEastern Africa\nSouthern Africa\n#N/A\nEastern Asia\n#N/A\nEastern Africa\nSouthern Europe\nSouthern Asia\nNorthern Africa\nSouth America\nSouthern Africa\nNorthern Europe\nWestern Europe\nWestern Asia\nEastern Asia\nCentral Asia\nEastern Africa\nSouth-Eastern Asia\nSouth-Eastern Asia\n#N/A\nWestern Africa\nPolynesia\nCaribbean\nNorthern Africa\nWestern Asia\nCentral Asia\nCaribbean\nEastern Africa\nNorthern Europe\nEastern Europe\nWestern Asia\nSouth America\nNorthern America\nCentral Asia\nMelanesia\nSouthern Europe\nSouth America\nSouth-Eastern Asia\nCaribbean\nCaribbean\nPolynesia\nNorthern Africa\nWestern Asia\nEastern Africa\nEastern Africa\n"
# Sub-region labels: same treatment as the region column -- split the
# delimited dump, convert "#N/A" sentinels to NA, keep as character.
# NOTE(review): the subList literal above is wrapped mid-entry
# ("Western " / "Africa"), which would split one label in two; presumably
# a single line in the original source -- verify.
subSplit <- strsplit(subList, "\n", fixed = TRUE)[[1]]
subSplit[subSplit %in% "#N/A"] <- NA
ctry_db$subregion <- subSplit
gdpList <- "18395\n12144\n168318\n#N/A\n#N/A\n91939\n#N/A\n#N/A\n1303\n541748\n10754\n#N/A\n#N/A\n1256640\n387299\n35686\n#N/A\n9047\n31823\n226760\n4473\n#N/A\n48126\n470179\n1770\n8930\n#N/A\n2085\n35699\n#N/A\n16532\n10948\n1769601\n10458\n50446\n12006\n2742\n19368\n30870\n1532343\n#N/A\n1684\n#N/A\n1782\n10441\n#N/A\n234903\n11391619\n#N/A\n#N/A\n274135\n622\n#N/A\n57689\n49855\n#N/A\n#N/A\n19931\n193535\n39820\n302571\n1894\n524\n71457\n99118\n346565\n26610\n11638\n5352\n23476\n69218\n#N/A\n#N/A\n4556\n239186\n2488284\n#N/A\n#N/A\n#N/A\n14563\n886\n14463\n3494898\n42761\n195878\n#N/A\n1028\n#N/A\n#N/A\n#N/A\n68389\n#N/A\n6754\n1168\n3456\n8259\n#N/A\n20930\n117065\n19444\n2250987\n940953\n412304\n156323\n307917\n#N/A\n311739\n1852499\n34649\n13779\n4730300\n#N/A\n39453\n128109\n69170\n166\n6560\n110455\n5794\n13761\n27945\n51815\n1806\n2168\n39389\n#N/A\n42776\n60984\n10492\n9740\n#N/A\n5474\n302748\n3270\n14103\n10463\n188\n#N/A\n4718\n11740\n#N/A\n1063606\n325\n6650\n#N/A\n11164\n4242\n#N/A\n104908\n12045\n68277\n10183\n#N/A\n21154\n769930\n#N/A\n#N/A\n179359\n13413\n7566\n415080\n#N/A\n#N/A\n#N/A\n#N/A\n376268\n59675\n284519\n296\n#N/A\n55227\n19915\n27323\n180291\n311687\n#N/A\n467350\n205860\n100852\n156595\n8834\n#N/A\n186514\n1267754\n8341\n#N/A\n#N/A\n#N/A\n955\n1439\n#N/A\n#N/A\n766\n876\n1556\n351\n637785\n14870\n37755\n1419\n#N/A\n4289\n296642\n#N/A\n#N/A\n90263\n44122\n1218\n#N/A\n280367\n#N/A\n1404383\n#N/A\n2628\n1252163\n82239\n94297\n4137\n3430\n517440\n662483\n#VALUE!\n519149\n6612\n46695\n390592\n2501\n#N/A\n4520\n430\n22809\n42388\n735716\n36573\n#N/A\n25613\n2649893\n87198\n375022\n54374\n18561934\n66797\n773\n#N/A\n333715\n200493\n#N/A\n#N/A\n#N/A\n#N/A\n31326\n20574\n14193\n"
# Split the GDP dump and normalise BOTH spreadsheet error codes -- "#N/A"
# and the Excel "#VALUE!" artefact present in gdpList -- to NA before the
# numeric coercion.  The original code only handled "#N/A", so "#VALUE!"
# reached as.numeric() and triggered the "NAs introduced by coercion"
# warning; the resulting values are unchanged (both codes end up NA), but
# the coercion is now warning-free and the intent is explicit.
gdpSplit <- strsplit(gdpList, "\n")[[1]]
gdpSplit[gdpSplit %in% c("#N/A", "#VALUE!")] <- NA
ctry_db$gdp <- as.numeric(gdpSplit)
# Create the spdf file
# Attach the assembled attribute table (ctry_db) to the polygons.
# NOTE(review): sp's SpatialPolygonsDataFrame() matches data rows to
# polygons by rownames vs polygon IDs when match.ID = TRUE (the default),
# which is why the country name is installed as rownames first; the
# polygon IDs do appear to be country names (the Palau object printed
# further below has ID "Palau") -- verify against the sp docs.
rownames(ctry_db) <- ctry_db$name
countries_spdf <- SpatialPolygonsDataFrame(countries_sp, data=ctry_db)
# Call summary() on countries_spdf and countries_sp
# The bare SpatialPolygons summary reports only bbox / projection info.
summary(countries_sp)
## Object of class SpatialPolygons
## Coordinates:
## min max
## x -180.00000 190.27084
## y -85.19218 83.59961
## Is projected: FALSE
## proj4string :
## [+proj=longlat +datum=WGS84 +ellps=WGS84 +towgs84=0,0,0]
# summary() on the data-frame-backed object reports the same bbox and
# projection, plus per-column attribute summaries (see transcript below).
summary(countries_spdf)
## Object of class SpatialPolygonsDataFrame
## Coordinates:
## min max
## x -180.00000 190.27084
## y -85.19218 83.59961
## Is projected: FALSE
## proj4string :
## [+proj=longlat +datum=WGS84 +ellps=WGS84 +towgs84=0,0,0]
## Data attributes:
## ctr name iso_a3
## Min. : 1 NULL:Afghanistan NULL:AFG
## 1st Qu.: 64 NULL:Albania NULL:ALB
## Median :127 NULL:Algeria NULL:DZA
## Mean :127 NULL:American Samoa NULL:ASM
## 3rd Qu.:190 NULL:Andorra NULL:AND
## Max. :253 NULL:Angola NULL:AGO
## NULL:Anguilla NULL:AIA
## NULL:Antarctica NULL:ATA
## NULL:Antigua NULL:ATG
## NULL:Argentina NULL:ARG
## NULL:Armenia NULL:ARM
## NULL:Aruba NULL:ABW
## NULL:Ascension Island NULL:NA
## NULL:Australia NULL:AUS
## NULL:Austria NULL:AUT
## NULL:Azerbaijan NULL:AZE
## NULL:Azores NULL:NA
## NULL:Bahamas NULL:BHS
## NULL:Bahrain NULL:BHR
## NULL:Bangladesh NULL:BGD
## NULL:Barbados NULL:BRB
## NULL:Barbuda NULL:NA
## NULL:Belarus NULL:BLR
## NULL:Belgium NULL:BEL
## NULL:Belize NULL:BLZ
## NULL:Benin NULL:BEN
## NULL:Bermuda NULL:BMU
## NULL:Bhutan NULL:BTN
## NULL:Bolivia NULL:BOL
## NULL:Bonaire NULL:NA
## NULL:Bosnia and Herzegovina NULL:BIH
## NULL:Botswana NULL:BWA
## NULL:Brazil NULL:BRA
## NULL:Brunei NULL:BRN
## NULL:Bulgaria NULL:BGR
## NULL:Burkina Faso NULL:BFA
## NULL:Burundi NULL:BDI
## NULL:Cambodia NULL:KHM
## NULL:Cameroon NULL:CMR
## NULL:Canada NULL:CAN
## NULL:Canary Islands NULL:NA
## NULL:Cape Verde NULL:CPV
## NULL:Cayman Islands NULL:CYM
## NULL:Central African Republic NULL:CAF
## NULL:Chad NULL:TCD
## NULL:Chagos Archipelago NULL:NA
## NULL:Chile NULL:CHL
## NULL:China NULL:CHN
## NULL:Christmas Island NULL:CXR
## NULL:Cocos Islands NULL:CCK
## NULL:Colombia NULL:COL
## NULL:Comoros NULL:COM
## NULL:Cook Islands NULL:COK
## NULL:Costa Rica NULL:CRI
## NULL:Croatia NULL:HRV
## NULL:Cuba NULL:CUB
## NULL:Curacao NULL:CUW
## NULL:Cyprus NULL:CYP
## NULL:Czech Republic NULL:CZE
## NULL:Democratic Republic of the Congo NULL:COD
## NULL:Denmark NULL:DNK
## NULL:Djibouti NULL:DJI
## NULL:Dominica NULL:DMA
## NULL:Dominican Republic NULL:DOM
## NULL:Ecuador NULL:ECU
## NULL:Egypt NULL:EGY
## NULL:El Salvador NULL:SLV
## NULL:Equatorial Guinea NULL:GNQ
## NULL:Eritrea NULL:ERI
## NULL:Estonia NULL:EST
## NULL:Ethiopia NULL:ETH
## NULL:Falkland Islands NULL:FLK
## NULL:Faroe Islands NULL:FRO
## NULL:Fiji NULL:FJI
## NULL:Finland NULL:FIN
## NULL:France NULL:FRA
## NULL:French Guiana NULL:GUF
## NULL:French Polynesia NULL:PYF
## NULL:French Southern and Antarctic Lands NULL:ATF
## NULL:Gabon NULL:GAB
## NULL:Gambia NULL:GMB
## NULL:Georgia NULL:GEO
## NULL:Germany NULL:DEU
## NULL:Ghana NULL:GHA
## NULL:Greece NULL:GRC
## NULL:Greenland NULL:GRL
## NULL:Grenada NULL:GRD
## NULL:Grenadines NULL:NA
## NULL:Guadeloupe NULL:GLP
## NULL:Guam NULL:GUM
## NULL:Guatemala NULL:GTM
## NULL:Guernsey NULL:GGY
## NULL:Guinea NULL:GIN
## NULL:Guinea-Bissau NULL:GNB
## NULL:Guyana NULL:GUY
## NULL:Haiti NULL:HTI
## NULL:Heard Island NULL:HMD
## NULL:Honduras NULL:HND
## NULL:Hungary NULL:HUN
## NULL:Iceland NULL:ISL
## NULL:India NULL:IND
## NULL:Indonesia NULL:IDN
## NULL:Iran NULL:IRN
## NULL:Iraq NULL:IRQ
## NULL:Ireland NULL:IRL
## NULL:Isle of Man NULL:IMN
## NULL:Israel NULL:ISR
## NULL:Italy NULL:ITA
## NULL:Ivory Coast NULL:CIV
## NULL:Jamaica NULL:JAM
## NULL:Japan NULL:JPN
## NULL:Jersey NULL:JEY
## NULL:Jordan NULL:JOR
## NULL:Kazakhstan NULL:KAZ
## NULL:Kenya NULL:KEN
## NULL:Kiribati NULL:KIR
## NULL:Kosovo NULL:NA
## NULL:Kuwait NULL:KWT
## NULL:Kyrgyzstan NULL:KGZ
## NULL:Laos NULL:LAO
## NULL:Latvia NULL:LVA
## NULL:Lebanon NULL:LBN
## NULL:Lesotho NULL:LSO
## NULL:Liberia NULL:LBR
## NULL:Libya NULL:LBY
## NULL:Liechtenstein NULL:LIE
## NULL:Lithuania NULL:LTU
## NULL:Luxembourg NULL:LUX
## NULL:Macedonia NULL:MKD
## NULL:Madagascar NULL:MDG
## NULL:Madeira Islands NULL:NA
## NULL:Malawi NULL:MWI
## NULL:Malaysia NULL:MYS
## NULL:Maldives NULL:MDV
## NULL:Mali NULL:MLI
## NULL:Malta NULL:MLT
## NULL:Marshall Islands NULL:MHL
## NULL:Martinique NULL:MTQ
## NULL:Mauritania NULL:MRT
## NULL:Mauritius NULL:MUS
## NULL:Mayotte NULL:MYT
## NULL:Mexico NULL:MEX
## NULL:Micronesia NULL:FSM
## NULL:Moldova NULL:MDA
## NULL:Monaco NULL:MCO
## NULL:Mongolia NULL:MNG
## NULL:Montenegro NULL:MNE
## NULL:Montserrat NULL:MSR
## NULL:Morocco NULL:MAR
## NULL:Mozambique NULL:MOZ
## NULL:Myanmar NULL:MMR
## NULL:Namibia NULL:NAM
## NULL:Nauru NULL:NRU
## NULL:Nepal NULL:NPL
## NULL:Netherlands NULL:NLD
## NULL:Nevis NULL:NA
## NULL:New Caledonia NULL:NCL
## NULL:New Zealand NULL:NZL
## NULL:Nicaragua NULL:NIC
## NULL:Niger NULL:NER
## NULL:Nigeria NULL:NGA
## NULL:Niue NULL:NIU
## NULL:Norfolk Island NULL:NFK
## NULL:North Korea NULL:PRK
## NULL:Northern Mariana Islands NULL:MNP
## NULL:Norway NULL:NOR
## NULL:Oman NULL:OMN
## NULL:Pakistan NULL:PAK
## NULL:Palau NULL:PLW
## NULL:Palestine NULL:PSE
## NULL:Panama NULL:PAN
## NULL:Papua New Guinea NULL:PNG
## NULL:Paraguay NULL:PRY
## NULL:Peru NULL:PER
## NULL:Philippines NULL:PHL
## NULL:Pitcairn Islands NULL:PCN
## NULL:Poland NULL:POL
## NULL:Portugal NULL:PRT
## NULL:Puerto Rico NULL:PRI
## NULL:Qatar NULL:QAT
## NULL:Republic of Congo NULL:COG
## NULL:Reunion NULL:REU
## NULL:Romania NULL:ROU
## NULL:Russia NULL:RUS
## NULL:Rwanda NULL:RWA
## NULL:Saba NULL:BES
## NULL:Saint Barthelemy NULL:BLM
## NULL:Saint Helena NULL:SHN
## NULL:Saint Kitts NULL:KNA
## NULL:Saint Lucia NULL:LCA
## NULL:Saint Martin NULL:MAF
## NULL:Saint Pierre and Miquelon NULL:SPM
## NULL:Saint Vincent NULL:VCT
## NULL:Samoa NULL:WSM
## NULL:San Marino NULL:SMR
## NULL:Sao Tome and Principe NULL:STP
## NULL:Saudi Arabia NULL:SAU
## NULL:Senegal NULL:SEN
## NULL:Serbia NULL:SRB
## NULL:Seychelles NULL:SYC
## NULL:Siachen Glacier NULL:NA
## NULL:Sierra Leone NULL:SLE
## NULL:Singapore NULL:SGP
## NULL:Sint Eustatius NULL:NA
## NULL:Sint Maarten NULL:SXM
## NULL:Slovakia NULL:SVK
## NULL:Slovenia NULL:SVN
## NULL:Solomon Islands NULL:SLB
## NULL:Somalia NULL:SOM
## NULL:South Africa NULL:ZAF
## NULL:South Georgia NULL:SGS
## NULL:South Korea NULL:KOR
## NULL:South Sandwich Islands NULL:NA
## NULL:South Sudan NULL:SSD
## NULL:Spain NULL:ESP
## NULL:Sri Lanka NULL:LKA
## NULL:Sudan NULL:SDN
## NULL:Suriname NULL:SUR
## NULL:Swaziland NULL:SWZ
## NULL:Sweden NULL:SWE
## NULL:Switzerland NULL:CHE
## NULL:Syria NULL:SYR
## NULL:Taiwan NULL:TWN
## NULL:Tajikistan NULL:TJK
## NULL:Tanzania NULL:TZA
## NULL:Thailand NULL:THA
## NULL:Timor-Leste NULL:TLS
## NULL:Tobago NULL:NA
## NULL:Togo NULL:TGO
## NULL:Tonga NULL:TON
## NULL:Trinidad NULL:TTO
## NULL:Tunisia NULL:TUN
## NULL:Turkey NULL:TUR
## NULL:Turkmenistan NULL:TKM
## NULL:Turks and Caicos Islands NULL:TCA
## NULL:Uganda NULL:UGA
## NULL:UK NULL:GBR
## NULL:Ukraine NULL:UKR
## NULL:United Arab Emirates NULL:ARE
## NULL:Uruguay NULL:URY
## NULL:USA NULL:USA
## NULL:Uzbekistan NULL:UZB
## NULL:Vanuatu NULL:VUT
## NULL:Vatican NULL:VAT
## NULL:Venezuela NULL:VEN
## NULL:Vietnam NULL:VNM
## NULL:Virgin Islands, British NULL:VGB
## NULL:Virgin Islands, US NULL:VIR
## NULL:Wallis and Futuna NULL:WLF
## NULL:Western Sahara NULL:ESH
## NULL:Yemen NULL:YEM
## NULL:Zambia NULL:ZMB
## NULL:Zimbabwe NULL:ZWE
## population gdp region
## Min. :8.010e+02 Min. : 166 NULL:Asia
## 1st Qu.:5.321e+05 1st Qu.: 7160 NULL:Europe
## Median :5.607e+06 Median : 34649 NULL:Africa
## Mean :3.285e+07 Mean : 400281 NULL:Oceania
## 3rd Qu.:2.079e+07 3rd Qu.: 198186 NULL:Europe
## Max. :1.382e+09 Max. :18561934 NULL:Africa
## NA's :27 NA's :66 NULL:Americas
## NULL:NA
## NULL:Americas
## NULL:Americas
## NULL:Asia
## NULL:Americas
## NULL:NA
## NULL:Oceania
## NULL:Europe
## NULL:Asia
## NULL:NA
## NULL:Americas
## NULL:Asia
## NULL:Asia
## NULL:Americas
## NULL:NA
## NULL:Europe
## NULL:Europe
## NULL:Americas
## NULL:Africa
## NULL:Americas
## NULL:Asia
## NULL:Americas
## NULL:NA
## NULL:Europe
## NULL:Africa
## NULL:Americas
## NULL:Asia
## NULL:Europe
## NULL:Africa
## NULL:Africa
## NULL:Asia
## NULL:Africa
## NULL:Americas
## NULL:NA
## NULL:Africa
## NULL:Americas
## NULL:Africa
## NULL:Africa
## NULL:NA
## NULL:Americas
## NULL:Asia
## NULL:NA
## NULL:NA
## NULL:Americas
## NULL:Africa
## NULL:Oceania
## NULL:Americas
## NULL:Europe
## NULL:Americas
## NULL:Americas
## NULL:Europe
## NULL:Europe
## NULL:Africa
## NULL:Europe
## NULL:Africa
## NULL:Americas
## NULL:Americas
## NULL:Americas
## NULL:Africa
## NULL:Americas
## NULL:Africa
## NULL:Africa
## NULL:Europe
## NULL:Africa
## NULL:Americas
## NULL:Europe
## NULL:Oceania
## NULL:Europe
## NULL:Europe
## NULL:Americas
## NULL:Oceania
## NULL:NA
## NULL:Africa
## NULL:Africa
## NULL:Asia
## NULL:Europe
## NULL:Africa
## NULL:Europe
## NULL:Americas
## NULL:Americas
## NULL:NA
## NULL:Americas
## NULL:Oceania
## NULL:Americas
## NULL:Europe
## NULL:Africa
## NULL:Africa
## NULL:Americas
## NULL:Americas
## NULL:NA
## NULL:Americas
## NULL:Europe
## NULL:Europe
## NULL:Asia
## NULL:Asia
## NULL:Asia
## NULL:Asia
## NULL:Europe
## NULL:Europe
## NULL:Asia
## NULL:Europe
## NULL:Africa
## NULL:Americas
## NULL:Asia
## NULL:NA
## NULL:Asia
## NULL:Asia
## NULL:Africa
## NULL:Oceania
## NULL:NA
## NULL:Asia
## NULL:Asia
## NULL:Asia
## NULL:Europe
## NULL:Asia
## NULL:Africa
## NULL:Africa
## NULL:Africa
## NULL:Europe
## NULL:Europe
## NULL:Europe
## NULL:Europe
## NULL:Africa
## NULL:NA
## NULL:Africa
## NULL:Asia
## NULL:Asia
## NULL:Africa
## NULL:Europe
## NULL:Oceania
## NULL:Americas
## NULL:Africa
## NULL:Africa
## NULL:Africa
## NULL:Americas
## NULL:NA
## NULL:Europe
## NULL:Europe
## NULL:Asia
## NULL:Europe
## NULL:Americas
## NULL:Africa
## NULL:Africa
## NULL:Asia
## NULL:Africa
## NULL:Oceania
## NULL:Asia
## NULL:Europe
## NULL:NA
## NULL:Oceania
## NULL:Oceania
## NULL:Americas
## NULL:Africa
## NULL:Africa
## NULL:Oceania
## NULL:NA
## NULL:Asia
## NULL:Oceania
## NULL:Europe
## NULL:Asia
## NULL:Asia
## NULL:Oceania
## NULL:Asia
## NULL:Americas
## NULL:Oceania
## NULL:Americas
## NULL:Americas
## NULL:Asia
## NULL:NA
## NULL:Europe
## NULL:Europe
## NULL:Americas
## NULL:Asia
## NULL:Africa
## NULL:Africa
## NULL:Europe
## NULL:Europe
## NULL:Africa
## NULL:NA
## NULL:NA
## NULL:Africa
## NULL:Americas
## NULL:Americas
## NULL:NA
## NULL:Americas
## NULL:Americas
## NULL:Oceania
## NULL:Europe
## NULL:Africa
## NULL:Asia
## NULL:Africa
## NULL:Europe
## NULL:Africa
## NULL:NA
## NULL:Africa
## NULL:Asia
## NULL:NA
## NULL:Americas
## NULL:Europe
## NULL:Europe
## NULL:Oceania
## NULL:Africa
## NULL:Africa
## NULL:NA
## NULL:Asia
## NULL:NA
## NULL:Africa
## NULL:Europe
## NULL:Asia
## NULL:Africa
## NULL:Americas
## NULL:Africa
## NULL:Europe
## NULL:Europe
## NULL:Asia
## NULL:Asia
## NULL:Asia
## NULL:Africa
## NULL:Asia
## NULL:Asia
## NULL:NA
## NULL:Africa
## NULL:Oceania
## NULL:Americas
## NULL:Africa
## NULL:Asia
## NULL:Asia
## NULL:Americas
## NULL:Africa
## NULL:Europe
## NULL:Europe
## NULL:Asia
## NULL:Americas
## NULL:Americas
## NULL:Asia
## NULL:Oceania
## NULL:Europe
## NULL:Americas
## NULL:Asia
## NULL:Americas
## NULL:Americas
## NULL:Oceania
## NULL:Africa
## NULL:Asia
## NULL:Africa
## NULL:Africa
## subregion
## NULL:Southern Asia
## NULL:Southern Europe
## NULL:Northern Africa
## NULL:Polynesia
## NULL:Southern Europe
## NULL:Middle Africa
## NULL:Caribbean
## NULL:NA
## NULL:Caribbean
## NULL:South America
## NULL:Western Asia
## NULL:Caribbean
## NULL:NA
## NULL:Australia and New Zealand
## NULL:Western Europe
## NULL:Western Asia
## NULL:NA
## NULL:Caribbean
## NULL:Western Asia
## NULL:Southern Asia
## NULL:Caribbean
## NULL:NA
## NULL:Eastern Europe
## NULL:Western Europe
## NULL:Central America
## NULL:Western Africa
## NULL:Northern America
## NULL:Southern Asia
## NULL:South America
## NULL:NA
## NULL:Southern Europe
## NULL:Southern Africa
## NULL:South America
## NULL:South-Eastern Asia
## NULL:Eastern Europe
## NULL:Western Africa
## NULL:Eastern Africa
## NULL:South-Eastern Asia
## NULL:Middle Africa
## NULL:Northern America
## NULL:NA
## NULL:Western Africa
## NULL:Caribbean
## NULL:Middle Africa
## NULL:Middle Africa
## NULL:NA
## NULL:South America
## NULL:Eastern Asia
## NULL:NA
## NULL:NA
## NULL:South America
## NULL:Eastern Africa
## NULL:Polynesia
## NULL:Central America
## NULL:Southern Europe
## NULL:Caribbean
## NULL:Caribbean
## NULL:Southern Europe
## NULL:Eastern Europe
## NULL:Middle Africa
## NULL:Northern Europe
## NULL:Eastern Africa
## NULL:Caribbean
## NULL:Caribbean
## NULL:South America
## NULL:Northern Africa
## NULL:Central America
## NULL:Middle Africa
## NULL:Eastern Africa
## NULL:Northern Europe
## NULL:Eastern Africa
## NULL:South America
## NULL:Northern Europe
## NULL:Melanesia
## NULL:Northern Europe
## NULL:Western Europe
## NULL:South America
## NULL:Polynesia
## NULL:NA
## NULL:Middle Africa
## NULL:Western Africa
## NULL:Western Asia
## NULL:Western Europe
## NULL:Western Africa
## NULL:Southern Europe
## NULL:Northern America
## NULL:Caribbean
## NULL:NA
## NULL:Caribbean
## NULL:Micronesia
## NULL:Central America
## NULL:Northern Europe
## NULL:Western Africa
## NULL:Western Africa
## NULL:South America
## NULL:Caribbean
## NULL:NA
## NULL:Central America
## NULL:Eastern Europe
## NULL:Northern Europe
## NULL:Southern Asia
## NULL:South-Eastern Asia
## NULL:Southern Asia
## NULL:Western Asia
## NULL:Northern Europe
## NULL:Northern Europe
## NULL:Western Asia
## NULL:Southern Europe
## NULL:Western Africa
## NULL:Caribbean
## NULL:Eastern Asia
## NULL:NA
## NULL:Western Asia
## NULL:Central Asia
## NULL:Eastern Africa
## NULL:Micronesia
## NULL:NA
## NULL:Western Asia
## NULL:Central Asia
## NULL:South-Eastern Asia
## NULL:Northern Europe
## NULL:Western Asia
## NULL:Southern Africa
## NULL:Western Africa
## NULL:Northern Africa
## NULL:Western Europe
## NULL:Northern Europe
## NULL:Western Europe
## NULL:Southern Europe
## NULL:Eastern Africa
## NULL:NA
## NULL:Eastern Africa
## NULL:South-Eastern Asia
## NULL:Southern Asia
## NULL:Western Africa
## NULL:Southern Europe
## NULL:Micronesia
## NULL:Caribbean
## NULL:Western Africa
## NULL:Eastern Africa
## NULL:Eastern Africa
## NULL:Central America
## NULL:NA
## NULL:Eastern Europe
## NULL:Western Europe
## NULL:Eastern Asia
## NULL:Southern Europe
## NULL:Caribbean
## NULL:Northern Africa
## NULL:Eastern Africa
## NULL:South-Eastern Asia
## NULL:Southern Africa
## NULL:Micronesia
## NULL:Southern Asia
## NULL:Western Europe
## NULL:NA
## NULL:Melanesia
## NULL:Australia and New Zealand
## NULL:Central America
## NULL:Western Africa
## NULL:Western Africa
## NULL:Polynesia
## NULL:NA
## NULL:Eastern Asia
## NULL:Micronesia
## NULL:Northern Europe
## NULL:Western Asia
## NULL:Southern Asia
## NULL:Micronesia
## NULL:Western Asia
## NULL:Central America
## NULL:Melanesia
## NULL:South America
## NULL:South America
## NULL:South-Eastern Asia
## NULL:NA
## NULL:Eastern Europe
## NULL:Southern Europe
## NULL:Caribbean
## NULL:Western Asia
## NULL:Middle Africa
## NULL:Eastern Africa
## NULL:Eastern Europe
## NULL:Eastern Europe
## NULL:Eastern Africa
## NULL:NA
## NULL:NA
## NULL:Western Africa
## NULL:Caribbean
## NULL:Caribbean
## NULL:NA
## NULL:Northern America
## NULL:Caribbean
## NULL:Polynesia
## NULL:Southern Europe
## NULL:Middle Africa
## NULL:Western Asia
## NULL:Western Africa
## NULL:Southern Europe
## NULL:Eastern Africa
## NULL:NA
## NULL:Western Africa
## NULL:South-Eastern Asia
## NULL:NA
## NULL:Caribbean
## NULL:Eastern Europe
## NULL:Southern Europe
## NULL:Melanesia
## NULL:Eastern Africa
## NULL:Southern Africa
## NULL:NA
## NULL:Eastern Asia
## NULL:NA
## NULL:Eastern Africa
## NULL:Southern Europe
## NULL:Southern Asia
## NULL:Northern Africa
## NULL:South America
## NULL:Southern Africa
## NULL:Northern Europe
## NULL:Western Europe
## NULL:Western Asia
## NULL:Eastern Asia
## NULL:Central Asia
## NULL:Eastern Africa
## NULL:South-Eastern Asia
## NULL:South-Eastern Asia
## NULL:NA
## NULL:Western Africa
## NULL:Polynesia
## NULL:Caribbean
## NULL:Northern Africa
## NULL:Western Asia
## NULL:Central Asia
## NULL:Caribbean
## NULL:Eastern Africa
## NULL:Northern Europe
## NULL:Eastern Europe
## NULL:Western Asia
## NULL:South America
## NULL:Northern America
## NULL:Central Asia
## NULL:Melanesia
## NULL:Southern Europe
## NULL:South America
## NULL:South-Eastern Asia
## NULL:Caribbean
## NULL:Caribbean
## NULL:Polynesia
## NULL:Northern Africa
## NULL:Western Asia
## NULL:Eastern Africa
## NULL:Eastern Africa
# Call str() with max.level = 2 on countries_spdf
# max.level = 2 limits the display to the S4 slots themselves instead of
# recursing into the 253 Polygons objects (see transcript below).
str(countries_spdf, max.level=2)
## Formal class 'SpatialPolygonsDataFrame' [package "sp"] with 5 slots
## ..@ data :'data.frame': 253 obs. of 7 variables:
## ..@ polygons :List of 253
## .. .. [list output truncated]
## ..@ plotOrder : int [1:253] 8 184 40 241 48 33 14 86 114 10 ...
## ..@ bbox : num [1:2, 1:2] -180 -85.2 190.3 83.6
## .. ..- attr(*, "dimnames")=List of 2
## ..@ proj4string:Formal class 'CRS' [package "sp"] with 1 slot
# Plot countries_spdf
plot(countries_spdf)
# 169th element of countries_spdf@polygons: one
# NOTE(review): the comment says countries_spdf but the code indexes
# countries_sp; the spdf was built from countries_sp so the polygon list
# is presumably the same -- confirm intent.  Element 169 is the Polygons
# object with ID "Palau" (per the print output below).
one <- countries_sp@polygons[[169]]
# Print one
print(one)
## An object of class "Polygons"
## Slot "Polygons":
## [[1]]
## An object of class "Polygon"
## Slot "labpt":
## [1] 131.161721 3.040651
##
## Slot "area":
## [1] 0.001400832
##
## Slot "hole":
## [1] FALSE
##
## Slot "ringDir":
## [1] 1
##
## Slot "coords":
## [,1] [,2]
## [1,] 131.1724 3.026221
## [2,] 131.1496 3.021875
## [3,] 131.1350 3.025244
## [4,] 131.1367 3.039453
## [5,] 131.1516 3.054101
## [6,] 131.1724 3.060596
## [7,] 131.1879 3.055615
## [8,] 131.1863 3.042090
## [9,] 131.1724 3.026221
##
##
## [[2]]
## An object of class "Polygon"
## Slot "labpt":
## [1] 134.580483 7.512956
##
## Slot "area":
## [1] 0.0263786
##
## Slot "hole":
## [1] FALSE
##
## Slot "ringDir":
## [1] 1
##
## Slot "coords":
## [,1] [,2]
## [1,] 134.5954 7.382031
## [2,] 134.5347 7.360645
## [3,] 134.5062 7.437109
## [4,] 134.5157 7.525781
## [5,] 134.5560 7.593945
## [6,] 134.5997 7.615772
## [7,] 134.6087 7.623584
## [8,] 134.6512 7.712109
## [9,] 134.6596 7.663281
## [10,] 134.6327 7.501318
## [11,] 134.5983 7.438281
## [12,] 134.5954 7.382031
##
##
##
## Slot "plotOrder":
## [1] 2 1
##
## Slot "labpt":
## [1] 134.580483 7.512956
##
## Slot "ID":
## [1] "Palau"
##
## Slot "area":
## [1] 0.02777943
# Call summary() on one
# summary() on a single S4 "Polygons" object is terse (length/class/mode);
# str() below is more informative.
summary(one)
## Length Class Mode
## 1 Polygons S4
# Call str() on one with max.level = 2
# Shows the five Polygons slots: the 2 member Polygon rings, plot order,
# label point, ID ("Palau"), and area.
str(one, max.level=2)
## Formal class 'Polygons' [package "sp"] with 5 slots
## ..@ Polygons :List of 2
## ..@ plotOrder: int [1:2] 2 1
## ..@ labpt : num [1:2] 134.58 7.51
## ..@ ID : chr "Palau"
## ..@ area : num 0.0278
# Grab the USA (happens to be 241) and name it as one
# Reuses the variable name `one`, replacing the Palau object from above.
one <- countries_sp@polygons[[241]]
# str() with max.level = 2, on the Polygons slot of one
# The USA is a multipart geometry: 127 component Polygon rings (states,
# islands, etc.), per the list output below.
str(one@Polygons, max.level=2)
## List of 127
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## $ :Formal class 'Polygon' [package "sp"] with 5 slots
## [list output truncated]
# str() with max.level = 2, on the 6th element of the one@Polygons (audibled to #59, which is the continental USA)
# NOTE(review): `one` is an sp Polygons object created earlier in this file
# (not visible here); @ slot access requires the sp package to be attached.
str(one@Polygons[[59]], max.level=2)
## Formal class 'Polygon' [package "sp"] with 5 slots
## ..@ labpt : num [1:2] -99.1 39.5
## ..@ area : num 834
## ..@ hole : logi FALSE
## ..@ ringDir: int 1
## ..@ coords : num [1:1990, 1:2] -74.7 -74.7 -74.4 -74 -73.6 ...
# Call plot on the coords slot of 6th element of one@Polygons (audibled to #59, which is the continental USA)
# coords is an n x 2 matrix (per the str() above), so plot() draws the ring vertices.
plot(one@Polygons[[59]]@coords)
# Subset the 169th object of countries_spdf: usa (audibled this to use countries_sp and number 241)
# Single-row [ subsetting of a SpatialPolygons object returns another
# SpatialPolygons object (see summary() below), not a raw Polygons list.
usa <- countries_sp[241, ]
# Look at summary() of usa
summary(usa)
## Object of class SpatialPolygons
## Coordinates:
## min max
## x -178.19452 179.77998
## y 18.96392 71.40767
## Is projected: FALSE
## proj4string :
## [+proj=longlat +datum=WGS84 +ellps=WGS84 +towgs84=0,0,0]
# Look at str() of usa
str(usa, max.level=2)
## Formal class 'SpatialPolygons' [package "sp"] with 4 slots
## ..@ polygons :List of 1
## ..@ plotOrder : int 1
## ..@ bbox : num [1:2, 1:2] -178.2 19 179.8 71.4
## .. ..- attr(*, "dimnames")=List of 2
## ..@ proj4string:Formal class 'CRS' [package "sp"] with 1 slot
# Call plot() on usa
plot(usa)
## *** MAY NEED TO SKIP UNLESS CAN FIND/CREATE THE spdf ["name", "iso_a3", "population", "gdp", "region", "subregion"]
# Call head() and str() on the data slot of countries_spdf
# The @data slot of a SpatialPolygonsDataFrame is an ordinary data.frame
# holding one attribute row per polygon feature.
head(countries_spdf@data)
## ctr name iso_a3 population gdp region
## Afghanistan 1 Afghanistan AFG 33369945 18395 Asia
## Albania 2 Albania ALB 2903700 12144 Europe
## Algeria 3 Algeria DZA 40375954 168318 Africa
## American Samoa 4 American Samoa ASM 55602 NA Oceania
## Andorra 5 Andorra AND 69165 NA Europe
## Angola 6 Angola AGO 25830958 91939 Africa
## subregion
## Afghanistan Southern Asia
## Albania Southern Europe
## Algeria Northern Africa
## American Samoa Polynesia
## Andorra Southern Europe
## Angola Middle Africa
str(countries_spdf@data)
## 'data.frame': 253 obs. of 7 variables:
## $ ctr : int 1 2 3 4 5 6 7 8 9 10 ...
## $ name : chr "Afghanistan" "Albania" "Algeria" "American Samoa" ...
## $ iso_a3 : chr "AFG" "ALB" "DZA" "ASM" ...
## $ population: num 33369945 2903700 40375954 55602 69165 ...
## $ gdp : num 18395 12144 168318 NA NA ...
## $ region : chr "Asia" "Europe" "Africa" "Oceania" ...
## $ subregion : chr "Southern Asia" "Southern Europe" "Northern Africa" "Polynesia" ...
# Pull out the name column using $
# $ and [[ on a SpatialPolygonsDataFrame forward to the @data columns,
# returning a plain vector (chr here, per the str() output).
str(countries_spdf$name)
## chr [1:253] "Afghanistan" "Albania" "Algeria" ...
# Pull out the subregion column using [[
str(countries_spdf[["subregion"]])
## chr [1:253] "Southern Asia" "Southern Europe" "Northern Africa" ...
# Create logical vector: is_nz
is_nz <- countries_spdf$name == "New Zealand"
# Subset countries_spdf using is_nz: nz
# Logical row-subsetting keeps both the geometry and the matching @data rows.
nz <- countries_spdf[is_nz, ]
# Plot nz
plot(nz)
library(tmap)
# Use qtm() to create a choropleth map of gdp
# qtm() is tmap's "quick thematic map" shortcut for a one-variable choropleth.
qtm(shp = countries_spdf, fill = "gdp")
# Add style argument to the tm_fill() call
# style = "quantile" bins countries so each color class holds roughly equal counts.
tm_shape(countries_spdf) +
tm_fill(col = "population", style="quantile") +
# Add a tm_borders() layer
tm_borders(col = "burlywood4")
# New plot, with tm_bubbles() instead of tm_fill()
tm_shape(countries_spdf) +
tm_bubbles(size = "population") +
# Add a tm_borders() layer
tm_borders(col = "burlywood4")
# Switch to a Hobo-Dyer projection (there is a problem with 74 (Fiji) and 184 (Russia))
# Rows 74 and 184 are dropped before reprojecting; presumably they render
# badly across the antimeridian — TODO confirm why these two misbehave.
tm_shape(countries_spdf[-c(74, 184), ], projection="hd") +
tm_grid(n.x = 11, n.y = 11) +
tm_fill(col = "population", style = "quantile") +
tm_borders(col = "burlywood4")
# Switch to a Robinson projection
tm_shape(countries_spdf[-c(74, 184), ], projection="robin") +
tm_grid(n.x = 11, n.y = 11) +
tm_fill(col = "population", style = "quantile") +
tm_borders(col = "burlywood4")
# Add tm_style_classic() to your plot
tm_shape(countries_spdf[-c(74, 184), ], projection="robin") +
tm_grid(n.x = 11, n.y = 11) +
tm_fill(col = "population", style = "quantile") +
tm_borders(col = "burlywood4") +
tm_style_classic()
# Plot from last exercise
tm_shape(countries_spdf[-c(74, 184), ]) +
tm_grid(n.x = 11, n.y = 11, projection = "longlat") +
tm_fill(col = "population", style = "quantile") +
tm_borders(col = "burlywood4")
# Save a static version "population.png"
# NOTE(review): save_tmap() is called without a map object; this relies on
# tmap 1.x defaulting the tm argument to the last rendered map — confirm
# against the installed tmap version (save_tmap was later renamed tmap_save).
save_tmap(filename = "population.png")
## Map saved to C:\Users\Dave\Documents\Personal\Learning\Coursera\RDirectory\RHomework\DataCamp\population.png
## Resolution: 2100 by 1500 pixels
## Size: 6.999999 by 4.999999 inches (300 dpi)
# Save an interactive version "population.html" (not run due to ActiveX blocking)
# save_tmap(filename = "population.html")
# Attempt to fix the Russia problem (only partially works)
# Run maptools::checkPolygonsHoles over every Polygons object in a copy of
# countries_spdf; per the warnings below it drops duplicate Polygon objects.
is0 <- countries_spdf
slot(is0, "polygons") <- lapply(slot(is0, "polygons"), maptools::checkPolygonsHoles)
## Warning in checkPolygonsGEOS(x, properly = properly, useSTRtree =
## useSTRtree): Duplicate Polygon objects dropped: 2
## Warning in checkPolygonsGEOS(x, properly = properly, useSTRtree =
## useSTRtree): Duplicate Polygon objects dropped: 2
# Side-by-side comparison: the drop-Fiji/Russia workaround vs the repaired
# copy (is0), first in Hobo-Dyer...
tm_shape(countries_spdf[-c(74, 184), ], projection="hd") +
tm_grid(n.x = 11, n.y = 11) +
tm_fill(col = "population", style = "quantile") +
tm_borders(col = "burlywood4")
tm_shape(is0, projection="hd") +
tm_grid(n.x = 11, n.y = 11) +
tm_fill(col = "population", style = "quantile") +
tm_borders(col = "burlywood4")
# ...then the same pair in the Robinson projection with the classic style.
tm_shape(countries_spdf[-c(74, 184), ], projection="robin") +
tm_grid(n.x = 11, n.y = 11) +
tm_fill(col = "population", style = "quantile") +
tm_borders(col = "burlywood4") +
tm_style_classic()
tm_shape(is0, projection="robin") +
tm_grid(n.x = 11, n.y = 11) +
tm_fill(col = "population", style = "quantile") +
tm_borders(col = "burlywood4") +
tm_style_classic()
Chapter 3 - Raster Data and Color
Data frames are not an ideal way to store spatial data - duplication of data per polygon/point, no CRS storage, inefficient graphing/display:
Humans perceive color as trichromatic and three-dimensional (HCL) - hue, chroma, and luminance:
Color palettes help to convey these key meanings, and are implemented in various R packages:
Mapping of numbers to colors can vary by package:
Example code includes:
library(raster)
## Warning: package 'raster' was built under R version 3.2.5
##
## Attaching package: 'raster'
## The following object is masked from 'package:dplyr':
##
## select
## The following objects are masked from 'package:Hmisc':
##
## mask, zoom
## The following objects are masked from 'package:MASS':
##
## area, select
## The following object is masked from 'package:tidyr':
##
## extract
## The following object is masked from 'package:data.table':
##
## shift
### *** LIKELY NEED TO COMMENT OUT DUE TO NOT HAVING FILE "pop"
# The exercises below are kept commented out because the course-provided
# `pop` / `pop_by_age` raster objects are not available locally.
# Print pop
# print(pop)
# Call str() on pop, with max.level = 2
# str(pop, max.level=2)
# Call summary on pop
# summary(pop)
# Call plot() on pop
# plot(pop)
# Call str() on values(pop)
# str(values(pop))
# Call head() on values(pop)
# head(values(pop))
# Print pop_by_age
# print(pop_by_age)
# Subset out the under_1 layer using [[
# pop_by_age[["under_1"]]
# Plot the under_1 layer
# plot(pop_by_age[["under_1"]])
library(tmap)
# Specify pop as the shp and add a tm_raster() layer
# tm_shape(pop) +
# tm_raster()
# Plot the under_1 layer in pop_by_age
# tm_shape(pop_by_age) +
# tm_raster(col="under_1")
# Call levelplot() on pop
# rasterVis::levelplot(pop)
# Hand-captured grid of predicted house prices, serialized as one long
# " ; "-separated string (360 values = 9 latitude rows x 40 longitude cols).
hs_preds <-
"259 ; 257 ; 256 ; 254 ; 252 ; 250 ; 248 ; 246 ; 245 ; 243 ; 241 ; 239 ; 238 ; 237 ; 236 ; 235 ; 235 ; 236 ; 237 ; 239 ; 241 ; 243 ; 245 ; 248 ; 251 ; 254 ; 257 ; 259 ; 262 ; 265 ; 267 ; 270 ; 272 ; 274 ; 275 ; 277 ; 278 ; 279 ; 280 ; 281 ; 257 ; 255 ; 254 ; 252 ; 250 ; 249 ; 247 ; 245 ; 244 ; 242 ; 240 ; 239 ; 237 ; 236 ; 236 ; 235 ; 235 ; 236 ; 237 ; 238 ; 240 ; 243 ; 245 ; 248 ; 250 ; 253 ; 256 ; 258 ; 261 ; 263 ; 266 ; 268 ; 270 ; 272 ; 274 ; 275 ; 277 ; 278 ; 278 ; 279 ; 254 ; 253 ; 252 ; 250 ; 249 ; 247 ; 246 ; 245 ; 243 ; 242 ; 240 ; 239 ; 237 ; 236 ; 236 ; 235 ; 236 ; 236 ; 237 ; 239 ; 240 ; 243 ; 245 ; 247 ; 250 ; 252 ; 254 ; 257 ; 259 ; 262 ; 264 ; 266 ; 268 ; 270 ; 272 ; 273 ; 274 ; 275 ; 276 ; 277 ; 252 ; 250 ; 249 ; 248 ; 247 ; 246 ; 245 ; 244 ; 243 ; 241 ; 240 ; 239 ; 238 ; 237 ; 236 ; 236 ; 236 ; 237 ; 238 ; 239 ; 241 ; 243 ; 245 ; 247 ; 249 ; 251 ; 253 ; 255 ; 258 ; 260 ; 262 ; 264 ; 266 ; 268 ; 269 ; 271 ; 272 ; 273 ; 274 ; 274 ; 249 ; 248 ; 247 ; 246 ; 245 ; 245 ; 244 ; 243 ; 242 ; 241 ; 240 ; 239 ; 238 ; 237 ; 237 ; 237 ; 237 ; 237 ; 238 ; 240 ; 241 ; 243 ; 244 ; 246 ; 248 ; 250 ; 252 ; 254 ; 256 ; 257 ; 259 ; 261 ; 263 ; 264 ; 266 ; 267 ; 268 ; 269 ; 270 ; 271 ; 246 ; 245 ; 245 ; 244 ; 243 ; 243 ; 242 ; 242 ; 241 ; 240 ; 240 ; 239 ; 238 ; 237 ; 237 ; 237 ; 237 ; 238 ; 239 ; 240 ; 241 ; 243 ; 244 ; 246 ; 247 ; 249 ; 250 ; 252 ; 253 ; 254 ; 256 ; 257 ; 259 ; 260 ; 262 ; 263 ; 264 ; 265 ; 266 ; 267 ; 243 ; 242 ; 242 ; 241 ; 241 ; 240 ; 240 ; 240 ; 239 ; 239 ; 238 ; 238 ; 237 ; 237 ; 237 ; 237 ; 237 ; 238 ; 239 ; 240 ; 241 ; 242 ; 244 ; 245 ; 246 ; 247 ; 248 ; 249 ; 250 ; 251 ; 252 ; 253 ; 254 ; 255 ; 256 ; 258 ; 259 ; 260 ; 261 ; 263 ; 239 ; 239 ; 238 ; 238 ; 238 ; 238 ; 237 ; 237 ; 237 ; 237 ; 237 ; 236 ; 236 ; 236 ; 236 ; 237 ; 237 ; 238 ; 239 ; 240 ; 241 ; 242 ; 243 ; 244 ; 245 ; 245 ; 246 ; 246 ; 246 ; 247 ; 247 ; 248 ; 249 ; 250 ; 251 ; 252 ; 253 ; 255 ; 256 ; 258 ; 236 ; 235 ; 235 ; 235 ; 234 ; 234 ; 234 ; 234 ; 234 ; 234 ; 234 ; 234 ; 234 ; 
235 ; 235 ; 236 ; 236 ; 237 ; 238 ; 195 ; 195 ; 195 ; 195 ; 196 ; 196 ; 196 ; 196 ; 197 ; 197 ; 198 ; 198 ; 199 ; 200 ; 201 ; 202 ; 203 ; 205 ; 208 ; 204 ; 198 "
# Split the string on " ; ", coerce to numeric, and lay the values out on a
# lat/lon grid: lat varies slowest (each row of 40 lon cells shares one lat).
preds <- data.frame(
  lat = rep(1:9, each = 40),
  lon = rep(1:40, times = 9),
  predicted_price = as.numeric(strsplit(hs_preds, " ; ")[[1]])
)
library(RColorBrewer)
# 9 steps on the RColorBrewer "BuPu" palette: blups
# brewer.pal() returns a character vector of hex colors.
blups <- brewer.pal(n=9, "BuPu")
# Add scale_fill_gradientn() with the blups palette
# scale_fill_gradientn() interpolates a continuous fill scale through the
# supplied color stops.
ggplot(preds) +
geom_tile(aes(lon, lat, fill = predicted_price), alpha = 0.8) +
scale_fill_gradientn(colors=blups)
library(viridisLite)
## Warning: package 'viridisLite' was built under R version 3.2.5
##
## Attaching package: 'viridisLite'
## The following objects are masked from 'package:viridis':
##
## inferno, magma, plasma, viridis
# viridisLite viridis palette with 9 steps: vir
# Per the viridisLite docs these palettes are perceptually uniform and
# readable for common forms of color blindness.
vir <- viridis(n=9)
# Add scale_fill_gradientn() with the vir palette
ggplot(preds) +
geom_tile(aes(lon, lat, fill = predicted_price), alpha = 0.8) +
scale_fill_gradientn(colors=vir)
# mag: a viridisLite magma palette with 9 steps
mag <- magma(n=9)
# Add scale_fill_gradientn() with the mag palette
ggplot(preds) +
geom_tile(aes(lon, lat, fill = predicted_price), alpha = 0.8) +
scale_fill_gradientn(colors=mag)
# Generate palettes from last time
library(RColorBrewer)
blups <- brewer.pal(9, "BuPu")
library(viridisLite)
vir <- viridis(9)
mag <- magma(9)
# The tmap exercises below stay commented out: the course raster
# `prop_by_age` is not available locally.
# Use the blups palette
# tm_shape(prop_by_age) +
# tm_raster("age_18_24", palette=blups) +
# tm_legend(position = c("right", "bottom"))
# Use the vir palette
# tm_shape(prop_by_age) +
# tm_raster("age_18_24", palette=vir) +
# tm_legend(position = c("right", "bottom"))
# Use the mag palette but reverse the order
# tm_shape(prop_by_age) +
# tm_raster("age_18_24", palette=rev(mag)) +
# tm_legend(position = c("right", "bottom"))
# Rebuild mag with 7 steps for the break-selection exercises that follow.
mag <- viridisLite::magma(7)
library(classInt)
## Warning: package 'classInt' was built under R version 3.2.5
# The classIntervals()/tmap exercises below stay commented out: the course
# objects `prop_by_age` and `migration` are not available locally.
# Create 5 "pretty" breaks with classIntervals()
# classIntervals(values(prop_by_age[["age_18_24"]]), n=5, style="pretty")
# Create 5 "quantile" breaks with classIntervals()
# classIntervals(values(prop_by_age[["age_18_24"]]), n=5, style="quantile")
# Use 5 "quantile" breaks in tm_raster()
# tm_shape(prop_by_age) +
# tm_raster("age_18_24", palette = mag, n=5, style="quantile") +
# tm_legend(position = c("right", "bottom"))
# Create histogram of proportions
# hist(values(prop_by_age[["age_18_24"]]))
# Use fixed breaks in tm_raster()
# tm_shape(prop_by_age) +
# tm_raster("age_18_24", palette = mag,
# style = "fixed", breaks=c(0.025, 0.05, 0.1, 0.2, 0.25, 0.3, 1))
# Save your plot to "prop_18-24.html"
# save_tmap(filename="prop_18-24.html")
# Print migration
# print(migration)
# Diverging "RdGy" palette
# A diverging palette suits net migration, where values straddle zero.
red_gray <- brewer.pal(n=7, "RdGy")
# Use red_gray as the palette
# tm_shape(migration) +
# tm_raster(palette=red_gray) +
# tm_legend(outside = TRUE, outside.position = c("bottom"))
# Add fixed breaks
# tm_shape(migration) +
# tm_raster(palette=red_gray, style="fixed", breaks=c(-5e6, -5e3, -5e2, -5e1, 5e1, 5e2, 5e3, 5e6)) +
# tm_legend(outside = TRUE, outside.position = c("bottom"))
library(raster)
# Plot land_cover (commented out: the course raster `land_cover` is not
# available locally).
# tm_shape(land_cover) +
# tm_raster()
# Emulate the default ggplot2 discrete palette: 8 evenly spaced hues around
# the HCL color wheel at fixed chroma (100) and luminance (65). seq()
# produces 9 hues, but 375 degrees wraps around to 15, so the duplicate
# final entry is dropped.
hcl_cols <- hcl(
  h = seq(15, 375, length = 9),
  c = 100,
  l = 65
)[seq_len(8)]
# Use hcl_cols as the palette
# tm_shape(land_cover) +
# tm_raster(palette=hcl_cols)
# Examine levels of land_cover
# levels(land_cover)
# A set of intuitive colors: one hand-picked color per land-cover class.
intuitive_cols <- c("darkgreen", "darkolivegreen4", "goldenrod2", "seagreen",
                    "wheat", "slategrey", "white", "lightskyblue1")
# Use intuitive_cols as palette
# tm_shape(land_cover) +
# tm_raster(palette=intuitive_cols) +
# tm_legend(position = c("left", "bottom"))
Chapter 4 - Data Import and Projections
Reading in spatial data - case study for creating map of income by census tract in Manhattan:
Coordinate reference systems stored with the data can be accessed using proj4string():
Adding data to spatial objects:
Polishing the map can make it even better - iterative process to improve the visuals:
Example code includes:
## downloaded "nynta_16d" from https://www1.nyc.gov/site/planning/data-maps/open-data/dwn-nynta.page)
library(sp)
library(rgdal)
# Call dir() with directory name
# A shapefile is really a bundle of files sharing a basename (.shp geometry,
# .dbf attributes, .prj projection, .shx index).
dir("./nynta_16d")
## [1] "nynta.dbf" "nynta.prj" "nynta.shp" "nynta.shp.xml"
## [5] "nynta.shx"
# Read in shapefile with readOGR(): neighborhoods
# readOGR(dsn, layer): dsn is the directory, layer the shapefile basename.
neighborhoods <- readOGR("nynta_16d", "nynta")
## OGR data source with driver: ESRI Shapefile
## Source: "nynta_16d", layer: "nynta"
## with 195 features
## It has 7 fields
# summary() of neighborhoods
# Note the CRS read from the .prj file: a projected Lambert conformal conic
# in US feet, unlike the long/lat data earlier in the file.
summary(neighborhoods)
## Object of class SpatialPolygonsDataFrame
## Coordinates:
## min max
## x 913175.1 1067382.5
## y 120121.9 272844.3
## Is projected: TRUE
## proj4string :
## [+proj=lcc +lat_1=40.66666666666666 +lat_2=41.03333333333333
## +lat_0=40.16666666666666 +lon_0=-74 +x_0=300000 +y_0=0
## +datum=NAD83 +units=us-ft +no_defs +ellps=GRS80 +towgs84=0,0,0]
## Data attributes:
## BoroCode BoroName CountyFIPS NTACode
## Min. :1 Bronx :38 005:38 BK09 : 1
## 1st Qu.:2 Brooklyn :51 047:51 BK17 : 1
## Median :3 Manhattan :29 061:29 BK19 : 1
## Mean :3 Queens :58 081:58 BK21 : 1
## 3rd Qu.:4 Staten Island:19 085:19 BK23 : 1
## Max. :5 BK25 : 1
## (Other):189
## NTAName Shape_Leng
## Airport : 1 Min. : 11000
## Allerton-Pelham Gardens : 1 1st Qu.: 23824
## Annadale-Huguenot-Prince's Bay-Eltingville: 1 Median : 30550
## Arden Heights : 1 Mean : 42004
## Astoria : 1 3rd Qu.: 41877
## Auburndale : 1 Max. :490427
## (Other) :189
## Shape_Area
## Min. : 5573902
## 1st Qu.: 19392084
## Median : 32629789
## Mean : 43227702
## 3rd Qu.: 50237458
## Max. :327760120
##
# Plot neighborhoods
plot(neighborhoods)
## need to find/create the census tract data file in raster format
library(raster)
# The income-grid exercise stays commented out: the course GeoTIFF
# "m5602ahhi00.tif" is not available locally.
# Call dir() on the directory
# dir("./nyc_grid_data")
# Use raster() with file path: income_grid
# income_grid <- raster("./nyc_grid_data/m5602ahhi00.tif")
# Call summary() on income_grid
# summary(income_grid)
# Call plot() on income_grid
# plot(income_grid)
library(sp)
# Call tracts(): nyc_tracts
# NOTE(review): tigris::tracts() fetches Census tract boundaries for the
# given state/county over the network; cb=TRUE requests the generalized
# cartographic-boundary files — confirm against the tigris documentation.
nyc_tracts <- tigris::tracts(state="NY", county="New York", cb=TRUE)
# Call summary() on nyc_tracts
summary(nyc_tracts)
## Object of class SpatialPolygonsDataFrame
## Coordinates:
## min max
## x -74.04731 -73.90700
## y 40.68419 40.88207
## Is projected: FALSE
## proj4string :
## [+proj=longlat +datum=NAD83 +no_defs +ellps=GRS80 +towgs84=0,0,0]
## Data attributes:
## STATEFP COUNTYFP TRACTCE AFFGEOID
## NULL:36 NULL:061 NULL:001401 NULL:1400000US36061001401
## NULL:36 NULL:061 NULL:002201 NULL:1400000US36061002201
## NULL:36 NULL:061 NULL:003200 NULL:1400000US36061003200
## NULL:36 NULL:061 NULL:004000 NULL:1400000US36061004000
## NULL:36 NULL:061 NULL:005501 NULL:1400000US36061005501
## NULL:36 NULL:061 NULL:005600 NULL:1400000US36061005600
## NULL:36 NULL:061 NULL:007400 NULL:1400000US36061007400
## NULL:36 NULL:061 NULL:007700 NULL:1400000US36061007700
## NULL:36 NULL:061 NULL:008200 NULL:1400000US36061008200
## NULL:36 NULL:061 NULL:010200 NULL:1400000US36061010200
## NULL:36 NULL:061 NULL:011500 NULL:1400000US36061011500
## NULL:36 NULL:061 NULL:012800 NULL:1400000US36061012800
## NULL:36 NULL:061 NULL:014401 NULL:1400000US36061014401
## NULL:36 NULL:061 NULL:015002 NULL:1400000US36061015002
## NULL:36 NULL:061 NULL:015802 NULL:1400000US36061015802
## NULL:36 NULL:061 NULL:017300 NULL:1400000US36061017300
## NULL:36 NULL:061 NULL:018500 NULL:1400000US36061018500
## NULL:36 NULL:061 NULL:018600 NULL:1400000US36061018600
## NULL:36 NULL:061 NULL:022400 NULL:1400000US36061022400
## NULL:36 NULL:061 NULL:023600 NULL:1400000US36061023600
## NULL:36 NULL:061 NULL:024900 NULL:1400000US36061024900
## NULL:36 NULL:061 NULL:025300 NULL:1400000US36061025300
## NULL:36 NULL:061 NULL:026700 NULL:1400000US36061026700
## NULL:36 NULL:061 NULL:031704 NULL:1400000US36061031704
## NULL:36 NULL:061 NULL:001300 NULL:1400000US36061001300
## NULL:36 NULL:061 NULL:002000 NULL:1400000US36061002000
## NULL:36 NULL:061 NULL:003100 NULL:1400000US36061003100
## NULL:36 NULL:061 NULL:003602 NULL:1400000US36061003602
## NULL:36 NULL:061 NULL:004300 NULL:1400000US36061004300
## NULL:36 NULL:061 NULL:004700 NULL:1400000US36061004700
## NULL:36 NULL:061 NULL:006300 NULL:1400000US36061006300
## NULL:36 NULL:061 NULL:008700 NULL:1400000US36061008700
## NULL:36 NULL:061 NULL:009500 NULL:1400000US36061009500
## NULL:36 NULL:061 NULL:009800 NULL:1400000US36061009800
## NULL:36 NULL:061 NULL:010602 NULL:1400000US36061010602
## NULL:36 NULL:061 NULL:011100 NULL:1400000US36061011100
## NULL:36 NULL:061 NULL:012700 NULL:1400000US36061012700
## NULL:36 NULL:061 NULL:013000 NULL:1400000US36061013000
## NULL:36 NULL:061 NULL:013900 NULL:1400000US36061013900
## NULL:36 NULL:061 NULL:014500 NULL:1400000US36061014500
## NULL:36 NULL:061 NULL:014801 NULL:1400000US36061014801
## NULL:36 NULL:061 NULL:016400 NULL:1400000US36061016400
## NULL:36 NULL:061 NULL:016900 NULL:1400000US36061016900
## NULL:36 NULL:061 NULL:017401 NULL:1400000US36061017401
## NULL:36 NULL:061 NULL:018100 NULL:1400000US36061018100
## NULL:36 NULL:061 NULL:019200 NULL:1400000US36061019200
## NULL:36 NULL:061 NULL:019800 NULL:1400000US36061019800
## NULL:36 NULL:061 NULL:020300 NULL:1400000US36061020300
## NULL:36 NULL:061 NULL:021200 NULL:1400000US36061021200
## NULL:36 NULL:061 NULL:022200 NULL:1400000US36061022200
## NULL:36 NULL:061 NULL:022600 NULL:1400000US36061022600
## NULL:36 NULL:061 NULL:023501 NULL:1400000US36061023501
## NULL:36 NULL:061 NULL:024100 NULL:1400000US36061024100
## NULL:36 NULL:061 NULL:029100 NULL:1400000US36061029100
## NULL:36 NULL:061 NULL:030900 NULL:1400000US36061030900
## NULL:36 NULL:061 NULL:000201 NULL:1400000US36061000201
## NULL:36 NULL:061 NULL:003002 NULL:1400000US36061003002
## NULL:36 NULL:061 NULL:006700 NULL:1400000US36061006700
## NULL:36 NULL:061 NULL:008300 NULL:1400000US36061008300
## NULL:36 NULL:061 NULL:011402 NULL:1400000US36061011402
## NULL:36 NULL:061 NULL:016500 NULL:1400000US36061016500
## NULL:36 NULL:061 NULL:019000 NULL:1400000US36061019000
## NULL:36 NULL:061 NULL:021500 NULL:1400000US36061021500
## NULL:36 NULL:061 NULL:023300 NULL:1400000US36061023300
## NULL:36 NULL:061 NULL:026500 NULL:1400000US36061026500
## NULL:36 NULL:061 NULL:006500 NULL:1400000US36061006500
## NULL:36 NULL:061 NULL:020500 NULL:1400000US36061020500
## NULL:36 NULL:061 NULL:024700 NULL:1400000US36061024700
## NULL:36 NULL:061 NULL:001001 NULL:1400000US36061001001
## NULL:36 NULL:061 NULL:003001 NULL:1400000US36061003001
## NULL:36 NULL:061 NULL:006100 NULL:1400000US36061006100
## NULL:36 NULL:061 NULL:010100 NULL:1400000US36061010100
## NULL:36 NULL:061 NULL:014000 NULL:1400000US36061014000
## NULL:36 NULL:061 NULL:020701 NULL:1400000US36061020701
## NULL:36 NULL:061 NULL:000700 NULL:1400000US36061000700
## NULL:36 NULL:061 NULL:000900 NULL:1400000US36061000900
## NULL:36 NULL:061 NULL:001502 NULL:1400000US36061001502
## NULL:36 NULL:061 NULL:002400 NULL:1400000US36061002400
## NULL:36 NULL:061 NULL:002800 NULL:1400000US36061002800
## NULL:36 NULL:061 NULL:003601 NULL:1400000US36061003601
## NULL:36 NULL:061 NULL:003700 NULL:1400000US36061003700
## NULL:36 NULL:061 NULL:004800 NULL:1400000US36061004800
## NULL:36 NULL:061 NULL:005200 NULL:1400000US36061005200
## NULL:36 NULL:061 NULL:005700 NULL:1400000US36061005700
## NULL:36 NULL:061 NULL:006400 NULL:1400000US36061006400
## NULL:36 NULL:061 NULL:006600 NULL:1400000US36061006600
## NULL:36 NULL:061 NULL:006900 NULL:1400000US36061006900
## NULL:36 NULL:061 NULL:007000 NULL:1400000US36061007000
## NULL:36 NULL:061 NULL:007100 NULL:1400000US36061007100
## NULL:36 NULL:061 NULL:007500 NULL:1400000US36061007500
## NULL:36 NULL:061 NULL:007600 NULL:1400000US36061007600
## NULL:36 NULL:061 NULL:008000 NULL:1400000US36061008000
## NULL:36 NULL:061 NULL:008100 NULL:1400000US36061008100
## NULL:36 NULL:061 NULL:008601 NULL:1400000US36061008601
## NULL:36 NULL:061 NULL:008900 NULL:1400000US36061008900
## NULL:36 NULL:061 NULL:009600 NULL:1400000US36061009600
## NULL:36 NULL:061 NULL:009900 NULL:1400000US36061009900
## NULL:36 NULL:061 NULL:011700 NULL:1400000US36061011700
## NULL:36 NULL:061 NULL:012600 NULL:1400000US36061012600
## NULL:36 NULL:061 NULL:013100 NULL:1400000US36061013100
## NULL:36 NULL:061 NULL:013600 NULL:1400000US36061013600
## NULL:36 NULL:061 NULL:014200 NULL:1400000US36061014200
## NULL:36 NULL:061 NULL:014700 NULL:1400000US36061014700
## NULL:36 NULL:061 NULL:014900 NULL:1400000US36061014900
## NULL:36 NULL:061 NULL:015700 NULL:1400000US36061015700
## NULL:36 NULL:061 NULL:016002 NULL:1400000US36061016002
## NULL:36 NULL:061 NULL:016300 NULL:1400000US36061016300
## NULL:36 NULL:061 NULL:017000 NULL:1400000US36061017000
## NULL:36 NULL:061 NULL:017402 NULL:1400000US36061017402
## NULL:36 NULL:061 NULL:019300 NULL:1400000US36061019300
## NULL:36 NULL:061 NULL:019500 NULL:1400000US36061019500
## NULL:36 NULL:061 NULL:019900 NULL:1400000US36061019900
## NULL:36 NULL:061 NULL:020800 NULL:1400000US36061020800
## NULL:36 NULL:061 NULL:011300 NULL:1400000US36061011300
## NULL:36 NULL:061 NULL:021303 NULL:1400000US36061021303
## NULL:36 NULL:061 NULL:022301 NULL:1400000US36061022301
## NULL:36 NULL:061 NULL:022700 NULL:1400000US36061022700
## NULL:36 NULL:061 NULL:023200 NULL:1400000US36061023200
## NULL:36 NULL:061 NULL:023801 NULL:1400000US36061023801
## NULL:36 NULL:061 NULL:024200 NULL:1400000US36061024200
## NULL:36 NULL:061 NULL:024301 NULL:1400000US36061024301
## NULL:36 NULL:061 NULL:024500 NULL:1400000US36061024500
## NULL:36 NULL:061 NULL:025100 NULL:1400000US36061025100
## NULL:36 NULL:061 NULL:025900 NULL:1400000US36061025900
## NULL:36 NULL:061 NULL:026100 NULL:1400000US36061026100
## NULL:36 NULL:061 NULL:026900 NULL:1400000US36061026900
## NULL:36 NULL:061 NULL:027100 NULL:1400000US36061027100
## NULL:36 NULL:061 NULL:027300 NULL:1400000US36061027300
## NULL:36 NULL:061 NULL:028500 NULL:1400000US36061028500
## NULL:36 NULL:061 NULL:029300 NULL:1400000US36061029300
## NULL:36 NULL:061 NULL:031900 NULL:1400000US36061031900
## NULL:36 NULL:061 NULL:000100 NULL:1400000US36061000100
## NULL:36 NULL:061 NULL:000600 NULL:1400000US36061000600
## NULL:36 NULL:061 NULL:000500 NULL:1400000US36061000500
## NULL:36 NULL:061 NULL:000800 NULL:1400000US36061000800
## NULL:36 NULL:061 NULL:001002 NULL:1400000US36061001002
## NULL:36 NULL:061 NULL:001402 NULL:1400000US36061001402
## NULL:36 NULL:061 NULL:001600 NULL:1400000US36061001600
## NULL:36 NULL:061 NULL:002202 NULL:1400000US36061002202
## NULL:36 NULL:061 NULL:002500 NULL:1400000US36061002500
## NULL:36 NULL:061 NULL:002700 NULL:1400000US36061002700
## NULL:36 NULL:061 NULL:002900 NULL:1400000US36061002900
## NULL:36 NULL:061 NULL:003300 NULL:1400000US36061003300
## NULL:36 NULL:061 NULL:003400 NULL:1400000US36061003400
## NULL:36 NULL:061 NULL:003800 NULL:1400000US36061003800
## NULL:36 NULL:061 NULL:004100 NULL:1400000US36061004100
## NULL:36 NULL:061 NULL:004400 NULL:1400000US36061004400
## NULL:36 NULL:061 NULL:005400 NULL:1400000US36061005400
## NULL:36 NULL:061 NULL:005502 NULL:1400000US36061005502
## NULL:36 NULL:061 NULL:005800 NULL:1400000US36061005800
## NULL:36 NULL:061 NULL:006000 NULL:1400000US36061006000
## NULL:36 NULL:061 NULL:008400 NULL:1400000US36061008400
## NULL:36 NULL:061 NULL:008602 NULL:1400000US36061008602
## NULL:36 NULL:061 NULL:008800 NULL:1400000US36061008800
## NULL:36 NULL:061 NULL:009300 NULL:1400000US36061009300
## NULL:36 NULL:061 NULL:009700 NULL:1400000US36061009700
## NULL:36 NULL:061 NULL:010000 NULL:1400000US36061010000
## NULL:36 NULL:061 NULL:010300 NULL:1400000US36061010300
## NULL:36 NULL:061 NULL:010601 NULL:1400000US36061010601
## NULL:36 NULL:061 NULL:011401 NULL:1400000US36061011401
## NULL:36 NULL:061 NULL:011600 NULL:1400000US36061011600
## NULL:36 NULL:061 NULL:011900 NULL:1400000US36061011900
## NULL:36 NULL:061 NULL:012400 NULL:1400000US36061012400
## NULL:36 NULL:061 NULL:012900 NULL:1400000US36061012900
## NULL:36 NULL:061 NULL:023700 NULL:1400000US36061023700
## NULL:36 NULL:061 NULL:023900 NULL:1400000US36061023900
## NULL:36 NULL:061 NULL:013200 NULL:1400000US36061013200
## NULL:36 NULL:061 NULL:028100 NULL:1400000US36061028100
## NULL:36 NULL:061 NULL:028300 NULL:1400000US36061028300
## NULL:36 NULL:061 NULL:029500 NULL:1400000US36061029500
## NULL:36 NULL:061 NULL:030300 NULL:1400000US36061030300
## NULL:36 NULL:061 NULL:031703 NULL:1400000US36061031703
## NULL:36 NULL:061 NULL:013500 NULL:1400000US36061013500
## NULL:36 NULL:061 NULL:013700 NULL:1400000US36061013700
## NULL:36 NULL:061 NULL:014300 NULL:1400000US36061014300
## NULL:36 NULL:061 NULL:014402 NULL:1400000US36061014402
## NULL:36 NULL:061 NULL:014602 NULL:1400000US36061014602
## NULL:36 NULL:061 NULL:015001 NULL:1400000US36061015001
## NULL:36 NULL:061 NULL:015100 NULL:1400000US36061015100
## NULL:36 NULL:061 NULL:015200 NULL:1400000US36061015200
## NULL:36 NULL:061 NULL:015601 NULL:1400000US36061015601
## NULL:36 NULL:061 NULL:015801 NULL:1400000US36061015801
## NULL:36 NULL:061 NULL:015900 NULL:1400000US36061015900
## NULL:36 NULL:061 NULL:016100 NULL:1400000US36061016100
## NULL:36 NULL:061 NULL:016600 NULL:1400000US36061016600
## NULL:36 NULL:061 NULL:017100 NULL:1400000US36061017100
## NULL:36 NULL:061 NULL:017200 NULL:1400000US36061017200
## NULL:36 NULL:061 NULL:017900 NULL:1400000US36061017900
## NULL:36 NULL:061 NULL:018200 NULL:1400000US36061018200
## NULL:36 NULL:061 NULL:018400 NULL:1400000US36061018400
## NULL:36 NULL:061 NULL:018900 NULL:1400000US36061018900
## NULL:36 NULL:061 NULL:019100 NULL:1400000US36061019100
## NULL:36 NULL:061 NULL:019400 NULL:1400000US36061019400
## NULL:36 NULL:061 NULL:019600 NULL:1400000US36061019600
## NULL:36 NULL:061 NULL:020000 NULL:1400000US36061020000
## NULL:36 NULL:061 NULL:020101 NULL:1400000US36061020101
## NULL:36 NULL:061 NULL:020600 NULL:1400000US36061020600
## NULL:36 NULL:061 NULL:020901 NULL:1400000US36061020901
## NULL:36 NULL:061 NULL:021100 NULL:1400000US36061021100
## NULL:36 NULL:061 NULL:021400 NULL:1400000US36061021400
## NULL:36 NULL:061 NULL:021703 NULL:1400000US36061021703
## NULL:36 NULL:061 NULL:022500 NULL:1400000US36061022500
## NULL:36 NULL:061 NULL:022800 NULL:1400000US36061022800
## NULL:36 NULL:061 NULL:023000 NULL:1400000US36061023000
## NULL:36 NULL:061 NULL:023502 NULL:1400000US36061023502
## NULL:36 NULL:061 NULL:000202 NULL:1400000US36061000202
## NULL:36 NULL:061 NULL:025500 NULL:1400000US36061025500
## NULL:36 NULL:061 NULL:027500 NULL:1400000US36061027500
## NULL:36 NULL:061 NULL:029700 NULL:1400000US36061029700
## NULL:36 NULL:061 NULL:001800 NULL:1400000US36061001800
## NULL:36 NULL:061 NULL:004200 NULL:1400000US36061004200
## NULL:36 NULL:061 NULL:005900 NULL:1400000US36061005900
## NULL:36 NULL:061 NULL:007200 NULL:1400000US36061007200
## NULL:36 NULL:061 NULL:007800 NULL:1400000US36061007800
## NULL:36 NULL:061 NULL:009400 NULL:1400000US36061009400
## NULL:36 NULL:061 NULL:011000 NULL:1400000US36061011000
## NULL:36 NULL:061 NULL:012000 NULL:1400000US36061012000
## NULL:36 NULL:061 NULL:013300 NULL:1400000US36061013300
## NULL:36 NULL:061 NULL:014601 NULL:1400000US36061014601
## NULL:36 NULL:061 NULL:017800 NULL:1400000US36061017800
## NULL:36 NULL:061 NULL:020102 NULL:1400000US36061020102
## NULL:36 NULL:061 NULL:021600 NULL:1400000US36061021600
## NULL:36 NULL:061 NULL:022302 NULL:1400000US36061022302
## NULL:36 NULL:061 NULL:022900 NULL:1400000US36061022900
## NULL:36 NULL:061 NULL:024000 NULL:1400000US36061024000
## NULL:36 NULL:061 NULL:024302 NULL:1400000US36061024302
## NULL:36 NULL:061 NULL:010400 NULL:1400000US36061010400
## NULL:36 NULL:061 NULL:025700 NULL:1400000US36061025700
## NULL:36 NULL:061 NULL:016001 NULL:1400000US36061016001
## NULL:36 NULL:061 NULL:004500 NULL:1400000US36061004500
## NULL:36 NULL:061 NULL:001200 NULL:1400000US36061001200
## NULL:36 NULL:061 NULL:006200 NULL:1400000US36061006200
## NULL:36 NULL:061 NULL:007300 NULL:1400000US36061007300
## NULL:36 NULL:061 NULL:007900 NULL:1400000US36061007900
## NULL:36 NULL:061 NULL:010800 NULL:1400000US36061010800
## NULL:36 NULL:061 NULL:012200 NULL:1400000US36061012200
## NULL:36 NULL:061 NULL:013800 NULL:1400000US36061013800
## NULL:36 NULL:061 NULL:016800 NULL:1400000US36061016800
## NULL:36 NULL:061 NULL:018300 NULL:1400000US36061018300
## NULL:36 NULL:061 NULL:022000 NULL:1400000US36061022000
## NULL:36 NULL:061 NULL:023400 NULL:1400000US36061023400
## NULL:36 NULL:061 NULL:026300 NULL:1400000US36061026300
## NULL:36 NULL:061 NULL:027700 NULL:1400000US36061027700
## NULL:36 NULL:061 NULL:031100 NULL:1400000US36061031100
## NULL:36 NULL:061 NULL:001501 NULL:1400000US36061001501
## NULL:36 NULL:061 NULL:009200 NULL:1400000US36061009200
## NULL:36 NULL:061 NULL:016200 NULL:1400000US36061016200
## NULL:36 NULL:061 NULL:021900 NULL:1400000US36061021900
## NULL:36 NULL:061 NULL:028700 NULL:1400000US36061028700
## NULL:36 NULL:061 NULL:002100 NULL:1400000US36061002100
## NULL:36 NULL:061 NULL:003900 NULL:1400000US36061003900
## NULL:36 NULL:061 NULL:005000 NULL:1400000US36061005000
## NULL:36 NULL:061 NULL:009000 NULL:1400000US36061009000
## NULL:36 NULL:061 NULL:011201 NULL:1400000US36061011201
## NULL:36 NULL:061 NULL:012100 NULL:1400000US36061012100
## NULL:36 NULL:061 NULL:013400 NULL:1400000US36061013400
## NULL:36 NULL:061 NULL:014802 NULL:1400000US36061014802
## NULL:36 NULL:061 NULL:015602 NULL:1400000US36061015602
## NULL:36 NULL:061 NULL:016700 NULL:1400000US36061016700
## NULL:36 NULL:061 NULL:018700 NULL:1400000US36061018700
## NULL:36 NULL:061 NULL:019701 NULL:1400000US36061019701
## NULL:36 NULL:061 NULL:022102 NULL:1400000US36061022102
## NULL:36 NULL:061 NULL:023802 NULL:1400000US36061023802
## NULL:36 NULL:061 NULL:015300 NULL:1400000US36061015300
## NULL:36 NULL:061 NULL:029900 NULL:1400000US36061029900
## NULL:36 NULL:061 NULL:011203 NULL:1400000US36061011203
## NULL:36 NULL:061 NULL:017500 NULL:1400000US36061017500
## NULL:36 NULL:061 NULL:018800 NULL:1400000US36061018800
## NULL:36 NULL:061 NULL:018000 NULL:1400000US36061018000
## NULL:36 NULL:061 NULL:021800 NULL:1400000US36061021800
## NULL:36 NULL:061 NULL:017700 NULL:1400000US36061017700
## NULL:36 NULL:061 NULL:004900 NULL:1400000US36061004900
## NULL:36 NULL:061 NULL:010900 NULL:1400000US36061010900
## NULL:36 NULL:061 NULL:015500 NULL:1400000US36061015500
## NULL:36 NULL:061 NULL:008603 NULL:1400000US36061008603
## NULL:36 NULL:061 NULL:023100 NULL:1400000US36061023100
## NULL:36 NULL:061 NULL:011202 NULL:1400000US36061011202
## NULL:36 NULL:061 NULL:012500 NULL:1400000US36061012500
## NULL:36 NULL:061 NULL:006800 NULL:1400000US36061006800
## NULL:36 NULL:061 NULL:019702 NULL:1400000US36061019702
## NULL:36 NULL:061 NULL:030700 NULL:1400000US36061030700
## NULL:36 NULL:061 NULL:002602 NULL:1400000US36061002602
## NULL:36 NULL:061 NULL:002601 NULL:1400000US36061002601
## NULL:36 NULL:061 NULL:011800 NULL:1400000US36061011800
## NULL:36 NULL:061 NULL:015400 NULL:1400000US36061015400
## NULL:36 NULL:061 NULL:009100 NULL:1400000US36061009100
## NULL:36 NULL:061 NULL:021000 NULL:1400000US36061021000
## NULL:36 NULL:061 NULL:027900 NULL:1400000US36061027900
## GEOID NAME LSAD ALAND AWATER
## NULL:36061001401 NULL:14.01 NULL:CT NULL:93510 NULL:0
## NULL:36061002201 NULL:22.01 NULL:CT NULL:161667 NULL:0
## NULL:36061003200 NULL:32 NULL:CT NULL:217682 NULL:0
## NULL:36061004000 NULL:40 NULL:CT NULL:178340 NULL:0
## NULL:36061005501 NULL:55.01 NULL:CT NULL:124447 NULL:0
## NULL:36061005600 NULL:56 NULL:CT NULL:174425 NULL:0
## NULL:36061007400 NULL:74 NULL:CT NULL:174085 NULL:0
## NULL:36061007700 NULL:77 NULL:CT NULL:178798 NULL:0
## NULL:36061008200 NULL:82 NULL:CT NULL:172401 NULL:0
## NULL:36061010200 NULL:102 NULL:CT NULL:172891 NULL:0
## NULL:36061011500 NULL:115 NULL:CT NULL:175588 NULL:0
## NULL:36061012800 NULL:128 NULL:CT NULL:178790 NULL:0
## NULL:36061014401 NULL:144.01 NULL:CT NULL:74319 NULL:0
## NULL:36061015002 NULL:150.02 NULL:CT NULL:125571 NULL:0
## NULL:36061015802 NULL:158.02 NULL:CT NULL:51510 NULL:0
## NULL:36061017300 NULL:173 NULL:CT NULL:178602 NULL:0
## NULL:36061018500 NULL:185 NULL:CT NULL:130383 NULL:0
## NULL:36061018600 NULL:186 NULL:CT NULL:121738 NULL:0
## NULL:36061022400 NULL:224 NULL:CT NULL:194687 NULL:0
## NULL:36061023600 NULL:236 NULL:CT NULL:263158 NULL:127979
## NULL:36061024900 NULL:249 NULL:CT NULL:60659 NULL:0
## NULL:36061025300 NULL:253 NULL:CT NULL:169349 NULL:0
## NULL:36061026700 NULL:267 NULL:CT NULL:47338 NULL:0
## NULL:36061031704 NULL:317.04 NULL:CT NULL:219944 NULL:473123
## NULL:36061001300 NULL:13 NULL:CT NULL:312732 NULL:0
## NULL:36061002000 NULL:20 NULL:CT NULL:126283 NULL:137883
## NULL:36061003100 NULL:31 NULL:CT NULL:204969 NULL:0
## NULL:36061003602 NULL:36.02 NULL:CT NULL:89011 NULL:0
## NULL:36061004300 NULL:43 NULL:CT NULL:149239 NULL:0
## NULL:36061004700 NULL:47 NULL:CT NULL:165421 NULL:0
## NULL:36061006300 NULL:63 NULL:CT NULL:178187 NULL:0
## NULL:36061008700 NULL:87 NULL:CT NULL:165443 NULL:0
## NULL:36061009500 NULL:95 NULL:CT NULL:171964 NULL:0
## NULL:36061009800 NULL:98 NULL:CT NULL:177068 NULL:0
## NULL:36061010602 NULL:106.02 NULL:CT NULL:125711 NULL:73073
## NULL:36061011100 NULL:111 NULL:CT NULL:175465 NULL:0
## NULL:36061012700 NULL:127 NULL:CT NULL:174026 NULL:0
## NULL:36061013000 NULL:130 NULL:CT NULL:178226 NULL:0
## NULL:36061013900 NULL:139 NULL:CT NULL:181595 NULL:0
## NULL:36061014500 NULL:145 NULL:CT NULL:178321 NULL:0
## NULL:36061014801 NULL:148.01 NULL:CT NULL:51979 NULL:0
## NULL:36061016400 NULL:164 NULL:CT NULL:177390 NULL:0
## NULL:36061016900 NULL:169 NULL:CT NULL:180002 NULL:0
## NULL:36061017401 NULL:174.01 NULL:CT NULL:100812 NULL:0
## NULL:36061018100 NULL:181 NULL:CT NULL:138455 NULL:0
## NULL:36061019200 NULL:192 NULL:CT NULL:189854 NULL:175645
## NULL:36061019800 NULL:198 NULL:CT NULL:225452 NULL:0
## NULL:36061020300 NULL:203 NULL:CT NULL:178223 NULL:0
## NULL:36061021200 NULL:212 NULL:CT NULL:176416 NULL:0
## NULL:36061022200 NULL:222 NULL:CT NULL:181422 NULL:0
## NULL:36061022600 NULL:226 NULL:CT NULL:152979 NULL:0
## NULL:36061023501 NULL:235.01 NULL:CT NULL:179045 NULL:0
## NULL:36061024100 NULL:241 NULL:CT NULL:178781 NULL:207330
## NULL:36061029100 NULL:291 NULL:CT NULL:187479 NULL:0
## NULL:36061030900 NULL:309 NULL:CT NULL:302752 NULL:0
## NULL:36061000201 NULL:2.01 NULL:CT NULL:90233 NULL:75976
## NULL:36061003002 NULL:30.02 NULL:CT NULL:78525 NULL:0
## NULL:36061006700 NULL:67 NULL:CT NULL:168478 NULL:0
## NULL:36061008300 NULL:83 NULL:CT NULL:174158 NULL:0
## NULL:36061011402 NULL:114.02 NULL:CT NULL:98799 NULL:0
## NULL:36061016500 NULL:165 NULL:CT NULL:204317 NULL:0
## NULL:36061019000 NULL:190 NULL:CT NULL:103802 NULL:0
## NULL:36061021500 NULL:215 NULL:CT NULL:69710 NULL:0
## NULL:36061023300 NULL:233 NULL:CT NULL:232212 NULL:285722
## NULL:36061026500 NULL:265 NULL:CT NULL:195037 NULL:0
## NULL:36061006500 NULL:65 NULL:CT NULL:201131 NULL:0
## NULL:36061020500 NULL:205 NULL:CT NULL:281121 NULL:374540
## NULL:36061024700 NULL:247 NULL:CT NULL:256348 NULL:400307
## NULL:36061001001 NULL:10.01 NULL:CT NULL:80145 NULL:244297
## NULL:36061003001 NULL:30.01 NULL:CT NULL:124153 NULL:0
## NULL:36061006100 NULL:61 NULL:CT NULL:142017 NULL:0
## NULL:36061010100 NULL:101 NULL:CT NULL:175518 NULL:0
## NULL:36061014000 NULL:140 NULL:CT NULL:179108 NULL:0
## NULL:36061020701 NULL:207.01 NULL:CT NULL:47273 NULL:0
## NULL:36061000700 NULL:7 NULL:CT NULL:256720 NULL:242749
## NULL:36061000900 NULL:9 NULL:CT NULL:315105 NULL:460146
## NULL:36061001502 NULL:15.02 NULL:CT NULL:207277 NULL:146802
## NULL:36061002400 NULL:24 NULL:CT NULL:182345 NULL:470253
## NULL:36061002800 NULL:28 NULL:CT NULL:183616 NULL:0
## NULL:36061003601 NULL:36.01 NULL:CT NULL:99812 NULL:0
## NULL:36061003700 NULL:37 NULL:CT NULL:383376 NULL:322356
## NULL:36061004800 NULL:48 NULL:CT NULL:175756 NULL:0
## NULL:36061005200 NULL:52 NULL:CT NULL:168892 NULL:0
## NULL:36061005700 NULL:57 NULL:CT NULL:108054 NULL:0
## NULL:36061006400 NULL:64 NULL:CT NULL:179196 NULL:0
## NULL:36061006600 NULL:66 NULL:CT NULL:174704 NULL:0
## NULL:36061006900 NULL:69 NULL:CT NULL:220486 NULL:275547
## NULL:36061007000 NULL:70 NULL:CT NULL:177329 NULL:0
## NULL:36061007100 NULL:71 NULL:CT NULL:164996 NULL:0
## NULL:36061007500 NULL:75 NULL:CT NULL:165732 NULL:283835
## NULL:36061007600 NULL:76 NULL:CT NULL:174274 NULL:0
## NULL:36061008000 NULL:80 NULL:CT NULL:172626 NULL:0
## NULL:36061008100 NULL:81 NULL:CT NULL:174287 NULL:0
## NULL:36061008601 NULL:86.01 NULL:CT NULL:125340 NULL:499842
## NULL:36061008900 NULL:89 NULL:CT NULL:165578 NULL:0
## NULL:36061009600 NULL:96 NULL:CT NULL:175329 NULL:0
## NULL:36061009900 NULL:99 NULL:CT NULL:1002374 NULL:1130336
## NULL:36061011700 NULL:117 NULL:CT NULL:187347 NULL:176830
## NULL:36061012600 NULL:126 NULL:CT NULL:183653 NULL:0
## NULL:36061013100 NULL:131 NULL:CT NULL:174140 NULL:0
## NULL:36061013600 NULL:136 NULL:CT NULL:219496 NULL:111000
## NULL:36061014200 NULL:142 NULL:CT NULL:178987 NULL:0
## NULL:36061014700 NULL:147 NULL:CT NULL:65414 NULL:0
## NULL:36061014900 NULL:149 NULL:CT NULL:174047 NULL:0
## NULL:36061015700 NULL:157 NULL:CT NULL:182396 NULL:0
## NULL:36061016002 NULL:160.02 NULL:CT NULL:51424 NULL:0
## NULL:36061016300 NULL:163 NULL:CT NULL:263616 NULL:352996
## NULL:36061017000 NULL:170 NULL:CT NULL:182916 NULL:0
## NULL:36061017402 NULL:174.02 NULL:CT NULL:50730 NULL:0
## NULL:36061019300 NULL:193 NULL:CT NULL:175716 NULL:0
## NULL:36061019500 NULL:195 NULL:CT NULL:218436 NULL:166301
## NULL:36061019900 NULL:199 NULL:CT NULL:226590 NULL:192056
## NULL:36061020800 NULL:208 NULL:CT NULL:148566 NULL:0
## NULL:36061011300 NULL:113 NULL:CT NULL:175653 NULL:0
## NULL:36061021303 NULL:213.03 NULL:CT NULL:153292 NULL:0
## NULL:36061022301 NULL:223.01 NULL:CT NULL:186804 NULL:131253
## NULL:36061022700 NULL:227 NULL:CT NULL:150703 NULL:0
## NULL:36061023200 NULL:232 NULL:CT NULL:180486 NULL:0
## NULL:36061023801 NULL:238.01 NULL:CT NULL:352467 NULL:518659
## NULL:36061024200 NULL:242 NULL:CT NULL:347358 NULL:135920
## NULL:36061024301 NULL:243.01 NULL:CT NULL:105194 NULL:0
## NULL:36061024500 NULL:245 NULL:CT NULL:212070 NULL:0
## NULL:36061025100 NULL:251 NULL:CT NULL:159422 NULL:0
## NULL:36061025900 NULL:259 NULL:CT NULL:75469 NULL:0
## NULL:36061026100 NULL:261 NULL:CT NULL:156222 NULL:0
## NULL:36061026900 NULL:269 NULL:CT NULL:175570 NULL:0
## NULL:36061027100 NULL:271 NULL:CT NULL:192909 NULL:0
## NULL:36061027300 NULL:273 NULL:CT NULL:176946 NULL:0
## NULL:36061028500 NULL:285 NULL:CT NULL:121942 NULL:0
## NULL:36061029300 NULL:293 NULL:CT NULL:157076 NULL:0
## NULL:36061031900 NULL:319 NULL:CT NULL:103141 NULL:314424
## NULL:36061000100 NULL:1 NULL:CT NULL:78638 NULL:0
## NULL:36061000600 NULL:6 NULL:CT NULL:240406 NULL:176018
## NULL:36061000500 NULL:5 NULL:CT NULL:843918 NULL:3393360
## NULL:36061000800 NULL:8 NULL:CT NULL:220708 NULL:167906
## NULL:36061001002 NULL:10.02 NULL:CT NULL:199683 NULL:245138
## NULL:36061001402 NULL:14.02 NULL:CT NULL:113920 NULL:0
## NULL:36061001600 NULL:16 NULL:CT NULL:207377 NULL:0
## NULL:36061002202 NULL:22.02 NULL:CT NULL:56112 NULL:0
## NULL:36061002500 NULL:25 NULL:CT NULL:131517 NULL:94115
## NULL:36061002700 NULL:27 NULL:CT NULL:59282 NULL:0
## NULL:36061002900 NULL:29 NULL:CT NULL:273357 NULL:0
## NULL:36061003300 NULL:33 NULL:CT NULL:341677 NULL:0
## NULL:36061003400 NULL:34 NULL:CT NULL:160257 NULL:0
## NULL:36061003800 NULL:38 NULL:CT NULL:202371 NULL:0
## NULL:36061004100 NULL:41 NULL:CT NULL:191275 NULL:0
## NULL:36061004400 NULL:44 NULL:CT NULL:410012 NULL:465240
## NULL:36061005400 NULL:54 NULL:CT NULL:167959 NULL:0
## NULL:36061005502 NULL:55.02 NULL:CT NULL:115486 NULL:0
## NULL:36061005800 NULL:58 NULL:CT NULL:174669 NULL:0
## NULL:36061006000 NULL:60 NULL:CT NULL:118942 NULL:300815
## NULL:36061008400 NULL:84 NULL:CT NULL:173067 NULL:0
## NULL:36061008602 NULL:86.02 NULL:CT NULL:106753 NULL:317001
## NULL:36061008800 NULL:88 NULL:CT NULL:179256 NULL:0
## NULL:36061009300 NULL:93 NULL:CT NULL:179101 NULL:0
## NULL:36061009700 NULL:97 NULL:CT NULL:172931 NULL:0
## NULL:36061010000 NULL:100 NULL:CT NULL:172894 NULL:0
## NULL:36061010300 NULL:103 NULL:CT NULL:174470 NULL:0
## NULL:36061010601 NULL:106.01 NULL:CT NULL:121208 NULL:130046
## NULL:36061011401 NULL:114.01 NULL:CT NULL:98508 NULL:0
## NULL:36061011600 NULL:116 NULL:CT NULL:195582 NULL:132923
## NULL:36061011900 NULL:119 NULL:CT NULL:177591 NULL:0
## NULL:36061012400 NULL:124 NULL:CT NULL:171580 NULL:75103
## NULL:36061012900 NULL:129 NULL:CT NULL:404224 NULL:407508
## NULL:36061023700 NULL:237 NULL:CT NULL:271911 NULL:273621
## NULL:36061023900 NULL:239 NULL:CT NULL:63850 NULL:0
## NULL:36061013200 NULL:132 NULL:CT NULL:200886 NULL:92937
## NULL:36061028100 NULL:281 NULL:CT NULL:94227 NULL:0
## NULL:36061028300 NULL:283 NULL:CT NULL:151093 NULL:0
## NULL:36061029500 NULL:295 NULL:CT NULL:135565 NULL:0
## NULL:36061030300 NULL:303 NULL:CT NULL:156650 NULL:0
## NULL:36061031703 NULL:317.03 NULL:CT NULL:258090 NULL:467423
## NULL:36061013500 NULL:135 NULL:CT NULL:432329 NULL:401309
## NULL:36061013700 NULL:137 NULL:CT NULL:225671 NULL:0
## NULL:36061014300 NULL:143 NULL:CT NULL:2991654 NULL:567106
## NULL:36061014402 NULL:144.02 NULL:CT NULL:176863 NULL:0
## NULL:36061014602 NULL:146.02 NULL:CT NULL:109193 NULL:0
## NULL:36061015001 NULL:150.01 NULL:CT NULL:51643 NULL:0
## NULL:36061015100 NULL:151 NULL:CT NULL:333044 NULL:306023
## NULL:36061015200 NULL:152 NULL:CT NULL:189341 NULL:529192
## NULL:36061015601 NULL:156.01 NULL:CT NULL:73566 NULL:0
## NULL:36061015801 NULL:158.01 NULL:CT NULL:125754 NULL:0
## NULL:36061015900 NULL:159 NULL:CT NULL:175127 NULL:110770
## NULL:36061016100 NULL:161 NULL:CT NULL:155741 NULL:0
## NULL:36061016600 NULL:166 NULL:CT NULL:174218 NULL:0
## NULL:36061017100 NULL:171 NULL:CT NULL:217835 NULL:194050
## NULL:36061017200 NULL:172 NULL:CT NULL:177939 NULL:0
## NULL:36061017900 NULL:179 NULL:CT NULL:213916 NULL:190568
## NULL:36061018200 NULL:182 NULL:CT NULL:176899 NULL:0
## NULL:36061018400 NULL:184 NULL:CT NULL:176580 NULL:0
## NULL:36061018900 NULL:189 NULL:CT NULL:265399 NULL:0
## NULL:36061019100 NULL:191 NULL:CT NULL:223476 NULL:175654
## NULL:36061019400 NULL:194 NULL:CT NULL:175052 NULL:0
## NULL:36061019600 NULL:196 NULL:CT NULL:177186 NULL:0
## NULL:36061020000 NULL:200 NULL:CT NULL:129287 NULL:0
## NULL:36061020101 NULL:201.01 NULL:CT NULL:55127 NULL:0
## NULL:36061020600 NULL:206 NULL:CT NULL:148748 NULL:0
## NULL:36061020901 NULL:209.01 NULL:CT NULL:115805 NULL:0
## NULL:36061021100 NULL:211 NULL:CT NULL:347999 NULL:307945
## NULL:36061021400 NULL:214 NULL:CT NULL:135562 NULL:0
## NULL:36061021703 NULL:217.03 NULL:CT NULL:313864 NULL:0
## NULL:36061022500 NULL:225 NULL:CT NULL:262466 NULL:136149
## NULL:36061022800 NULL:228 NULL:CT NULL:180634 NULL:0
## NULL:36061023000 NULL:230 NULL:CT NULL:173684 NULL:0
## NULL:36061023502 NULL:235.02 NULL:CT NULL:41338 NULL:0
## NULL:36061000202 NULL:2.02 NULL:CT NULL:310039 NULL:428737
## NULL:36061025500 NULL:255 NULL:CT NULL:441356 NULL:586786
## NULL:36061027500 NULL:275 NULL:CT NULL:279689 NULL:852909
## NULL:36061029700 NULL:297 NULL:CT NULL:948118 NULL:1006331
## NULL:36061001800 NULL:18 NULL:CT NULL:222968 NULL:0
## NULL:36061004200 NULL:42 NULL:CT NULL:85723 NULL:0
## NULL:36061005900 NULL:59 NULL:CT NULL:116172 NULL:0
## NULL:36061007200 NULL:72 NULL:CT NULL:175015 NULL:0
## NULL:36061007800 NULL:78 NULL:CT NULL:177474 NULL:0
## NULL:36061009400 NULL:94 NULL:CT NULL:152754 NULL:0
## NULL:36061011000 NULL:110 NULL:CT NULL:176509 NULL:0
## NULL:36061012000 NULL:120 NULL:CT NULL:173024 NULL:0
## NULL:36061013300 NULL:133 NULL:CT NULL:174356 NULL:0
## NULL:36061014601 NULL:146.01 NULL:CT NULL:74328 NULL:0
## NULL:36061017800 NULL:178 NULL:CT NULL:248851 NULL:209809
## NULL:36061020102 NULL:201.02 NULL:CT NULL:83350 NULL:0
## NULL:36061021600 NULL:216 NULL:CT NULL:173198 NULL:0
## NULL:36061022302 NULL:223.02 NULL:CT NULL:60095 NULL:96898
## NULL:36061022900 NULL:229 NULL:CT NULL:211053 NULL:131446
## NULL:36061024000 NULL:240 NULL:CT NULL:2183933 NULL:793007
## NULL:36061024302 NULL:243.02 NULL:CT NULL:213258 NULL:113258
## NULL:36061010400 NULL:104 NULL:CT NULL:173232 NULL:0
## NULL:36061025700 NULL:257 NULL:CT NULL:144063 NULL:0
## NULL:36061016001 NULL:160.01 NULL:CT NULL:125615 NULL:0
## NULL:36061004500 NULL:45 NULL:CT NULL:104947 NULL:0
## NULL:36061001200 NULL:12 NULL:CT NULL:120597 NULL:0
## NULL:36061006200 NULL:62 NULL:CT NULL:293510 NULL:862281
## NULL:36061007300 NULL:73 NULL:CT NULL:165299 NULL:0
## NULL:36061007900 NULL:79 NULL:CT NULL:285956 NULL:502030
## NULL:36061010800 NULL:108 NULL:CT NULL:182589 NULL:0
## NULL:36061012200 NULL:122 NULL:CT NULL:172616 NULL:0
## NULL:36061013800 NULL:138 NULL:CT NULL:182091 NULL:0
## NULL:36061016800 NULL:168 NULL:CT NULL:199278 NULL:0
## NULL:36061018300 NULL:183 NULL:CT NULL:224312 NULL:187898
## NULL:36061022000 NULL:220 NULL:CT NULL:174955 NULL:0
## NULL:36061023400 NULL:234 NULL:CT NULL:108235 NULL:0
## NULL:36061026300 NULL:263 NULL:CT NULL:157201 NULL:0
## NULL:36061027700 NULL:277 NULL:CT NULL:163262 NULL:0
## NULL:36061031100 NULL:311 NULL:CT NULL:730774 NULL:312736
## NULL:36061001501 NULL:15.01 NULL:CT NULL:257851 NULL:175058
## NULL:36061009200 NULL:92 NULL:CT NULL:197023 NULL:0
## NULL:36061016200 NULL:162 NULL:CT NULL:272018 NULL:832556
## NULL:36061021900 NULL:219 NULL:CT NULL:279408 NULL:191783
## NULL:36061028700 NULL:287 NULL:CT NULL:532366 NULL:1018817
## NULL:36061002100 NULL:21 NULL:CT NULL:236794 NULL:0
## NULL:36061003900 NULL:39 NULL:CT NULL:331072 NULL:447417
## NULL:36061005000 NULL:50 NULL:CT NULL:166467 NULL:0
## NULL:36061009000 NULL:90 NULL:CT NULL:174692 NULL:0
## NULL:36061011201 NULL:112.01 NULL:CT NULL:79536 NULL:0
## NULL:36061012100 NULL:121 NULL:CT NULL:178161 NULL:0
## NULL:36061013400 NULL:134 NULL:CT NULL:181779 NULL:0
## NULL:36061014802 NULL:148.02 NULL:CT NULL:125620 NULL:0
## NULL:36061015602 NULL:156.02 NULL:CT NULL:108755 NULL:0
## NULL:36061016700 NULL:167 NULL:CT NULL:196978 NULL:141457
## NULL:36061018700 NULL:187 NULL:CT NULL:218107 NULL:184467
## NULL:36061019701 NULL:197.01 NULL:CT NULL:235976 NULL:0
## NULL:36061022102 NULL:221.02 NULL:CT NULL:99000 NULL:0
## NULL:36061023802 NULL:238.02 NULL:CT NULL:243417 NULL:313651
## NULL:36061015300 NULL:153 NULL:CT NULL:174116 NULL:0
## NULL:36061029900 NULL:299 NULL:CT NULL:616054 NULL:322917
## NULL:36061011203 NULL:112.03 NULL:CT NULL:77947 NULL:0
## NULL:36061017500 NULL:175 NULL:CT NULL:216905 NULL:192184
## NULL:36061018800 NULL:188 NULL:CT NULL:148532 NULL:0
## NULL:36061018000 NULL:180 NULL:CT NULL:212522 NULL:0
## NULL:36061021800 NULL:218 NULL:CT NULL:181869 NULL:0
## NULL:36061017700 NULL:177 NULL:CT NULL:174561 NULL:0
## NULL:36061004900 NULL:49 NULL:CT NULL:207320 NULL:0
## NULL:36061010900 NULL:109 NULL:CT NULL:175741 NULL:0
## NULL:36061015500 NULL:155 NULL:CT NULL:197390 NULL:214654
## NULL:36061008603 NULL:86.03 NULL:CT NULL:116670 NULL:153941
## NULL:36061023100 NULL:231 NULL:CT NULL:182674 NULL:0
## NULL:36061011202 NULL:112.02 NULL:CT NULL:78008 NULL:0
## NULL:36061012500 NULL:125 NULL:CT NULL:174253 NULL:0
## NULL:36061006800 NULL:68 NULL:CT NULL:174443 NULL:0
## NULL:36061019702 NULL:197.02 NULL:CT NULL:47396 NULL:0
## NULL:36061030700 NULL:307 NULL:CT NULL:84374 NULL:0
## NULL:36061002602 NULL:26.02 NULL:CT NULL:103692 NULL:0
## NULL:36061002601 NULL:26.01 NULL:CT NULL:106047 NULL:0
## NULL:36061011800 NULL:118 NULL:CT NULL:176768 NULL:0
## NULL:36061015400 NULL:154 NULL:CT NULL:177371 NULL:0
## NULL:36061009100 NULL:91 NULL:CT NULL:179099 NULL:0
## NULL:36061021000 NULL:210 NULL:CT NULL:203716 NULL:116287
## NULL:36061027900 NULL:279 NULL:CT NULL:180013 NULL:0
# Plot nyc_tracts
# BUG FIX: tigris does not export a plot() function, so `tigris::plot(...)`
# fails with "'plot' is not an exported object from 'namespace:tigris'".
# nyc_tracts is a Spatial* object (see the proj4string()/coordinates() calls
# below), so the base plot() generic dispatches to the S4 method registered
# by the sp package.
plot(nyc_tracts)
# proj4string() on nyc_tracts and neighborhoods
# Inspect the coordinate reference system (CRS) of each layer before trying
# to overlay them. The echoed output shows they differ: nyc_tracts is
# unprojected long/lat on NAD83, while neighborhoods is a Lambert conformal
# conic projection (lcc) with units in US feet.
proj4string(nyc_tracts)
## [1] "+proj=longlat +datum=NAD83 +no_defs +ellps=GRS80 +towgs84=0,0,0"
proj4string(neighborhoods)
## [1] "+proj=lcc +lat_1=40.66666666666666 +lat_2=41.03333333333333 +lat_0=40.16666666666666 +lon_0=-74 +x_0=300000 +y_0=0 +datum=NAD83 +units=us-ft +no_defs +ellps=GRS80 +towgs84=0,0,0"
# coordinates() on nyc_tracts and neighborhoods
# The magnitudes of the coordinates confirm the CRS mismatch: nyc_tracts is
# in degrees (roughly -74, 40.7), neighborhoods in feet (hundreds of
# thousands). Overlaying them as-is would not line up.
head(coordinates(nyc_tracts))
## [,1] [,2]
## 156 -73.98733 40.71516
## 157 -73.98184 40.71912
## 158 -73.98372 40.72556
## 159 -73.98603 40.73074
## 160 -73.99708 40.72762
## 161 -73.98730 40.74179
head(coordinates(neighborhoods))
## [,1] [,2]
## 0 987397.5 169148.4
## 1 1037640.2 214077.6
## 2 1043002.7 212969.8
## 3 1037005.2 219265.3
## 4 1020706.7 217413.9
## 5 1011274.1 240777.4
# plot() neighborhoods and nyc_tracts
# First overlay attempt: because the two layers are still in different CRSs
# (degrees vs. feet -- see the coordinates() output above), the red tract
# layer will not align with the neighborhoods plot.
plot(neighborhoods)
plot(nyc_tracts, col="red", add=TRUE)
# Use spTransform on neighborhoods: neighborhoods
# Reproject neighborhoods into the CRS of nyc_tracts so the layers align.
# NOTE(review): this passes the proj4 string (a character) directly as the
# CRSobj argument; the more explicit form is sp::CRS(proj4string(nyc_tracts)).
# It works here via sp's character-signature spTransform method -- confirm
# against the installed sp version.
neighborhoods <- sp::spTransform(neighborhoods, proj4string(nyc_tracts))
# head() on coordinates() of neighborhoods
# After reprojection the neighborhood coordinates are in long/lat degrees,
# matching nyc_tracts.
head(coordinates(neighborhoods))
## [,1] [,2]
## 0 -73.98866 40.63095
## 1 -73.80729 40.75411
## 2 -73.78795 40.75104
## 3 -73.80955 40.76835
## 4 -73.86840 40.76335
## 5 -73.90235 40.82751
## *** NOTE THAT I DO NOT HAVE THE WATER DATA
# Plot neighborhoods, nyc_tracts and water
# Second overlay attempt: now that both layers share a CRS, the tract layer
# lines up. The water layer is commented out because that dataset is not
# available locally (see note above).
plot(neighborhoods)
plot(nyc_tracts, col="red", add=TRUE)
# plot(water, col="blue", add=TRUE)
# Example for how to screw up the data slot by assuming same orders!
# Use str() on nyc_income and nyc_tracts@data
# str(nyc_income)
# str(nyc_tracts@data)
# Highlight tract 002201 in nyc_tracts
# plot(nyc_tracts)
# plot(nyc_tracts[nyc_tracts$TRACTCE == "002201", ],
# col = "red", add = TRUE)
# Set nyc_tracts@data to nyc_income
# nyc_tracts@data <- nyc_income
# Highlight tract 002201 in nyc_tracts
# plot(nyc_tracts)
# plot(nyc_tracts[nyc_tracts$tract == "002201", ],
# col = "red", add = TRUE)
# Better approach - first, verify that the data can be matched up without errors
# Check for duplicates in nyc_income
# any(duplicated(nyc_income$tract))
# Check for duplicates in nyc_tracts
# any(duplicated(nyc_tracts$TRACTCE))
# Check nyc_tracts in nyc_income
# all(nyc_tracts$TRACTCE %in% nyc_income$tract)
# Check nyc_income in nyc_tracts
# all(nyc_income$tract %in% nyc_tracts$TRACTCE)
# Merge nyc_tracts and nyc_income: nyc_tracts_merge
# nyc_tracts_merge <- sp::merge(nyc_tracts, nyc_income, by.x="TRACTCE", by.y="tract")
# Call summary() on nyc_tracts_merge
# summary(nyc_tracts_merge)
# Choropleth with col mapped to estimate
# tm_shape(nyc_tracts_merge) +
# tm_fill(col="estimate")
# library(tmap)
#
# tm_shape(nyc_tracts_merge) +
# tm_fill(col = "estimate") +
# # Add a water layer, tm_fill() with col = "grey90"
# tm_shape(water) +
# tm_fill(col = "grey90") +
# # Add a neighborhood layer, tm_borders()
# tm_shape(neighborhoods) +
# tm_borders()
# Find unique() nyc_tracts_merge$COUNTYFP
# unique(nyc_tracts_merge$COUNTYFP)
# Add logical expression to pull out New York County
# manhat_hoods <- neighborhoods[neighborhoods$CountyFIPS == "061", ]
#
# tm_shape(nyc_tracts_merge) +
# tm_fill(col = "estimate") +
# tm_shape(water) +
# tm_fill(col = "grey90") +
# Edit to use manhat_hoods instead
# tm_shape(manhat_hoods) +
# tm_borders() +
# Add a tm_text() layer
# tm_text(text="NTAName")
# gsub() to replace " " with "\n"
# manhat_hoods$name <- gsub(" ", "\n", manhat_hoods$NTAName)
# gsub() to replace "-" with "/\n"
# manhat_hoods$name <- gsub("-", "/\n", manhat_hoods$name)
# Edit to map text to name, set size to 0.5
# tm_shape(nyc_tracts_merge) +
# tm_fill(col = "estimate") +
# tm_shape(water) +
# tm_fill(col = "grey90") +
# tm_shape(manhat_hoods) +
# tm_borders() +
# tm_text(text = "name", size=0.5)
# tm_shape(nyc_tracts_merge) +
# Add title and change palette
# tm_fill(col = "estimate",
# title="Median Income",
# palette="Greens") +
# Add tm_borders()
# tm_borders(col="grey60", lwd=0.5) +
# tm_shape(water) +
# tm_fill(col = "grey90") +
# tm_shape(manhat_hoods) +
# Change col and lwd of neighborhood boundaries
# tm_borders(col="grey40", lwd=2) +
# tm_text(text = "name", size = 0.5) +
# Add tm_credits()
# tm_credits("Source: ACS 2014 5-year Estimates, \n accessed via acs package", position=c("right", "bottom"))
# Save map as "nyc_income_map.png"
# save_tmap(filename="nyc_income_map.png", width=4, height=7)